Spark Streaming with Kafka, cannot receive all data - apache-spark

In my Spark Streaming program, I try to receive data from Kafka. From the Kafka producer I send 1 million messages, but in Spark Streaming I cannot receive all of them; some messages are always lost.
I start the Kafka server with the default config.
Here is my producer's code:
val props = new HashMap[String, Object]()
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")

val producer = new KafkaProducer[String, String](props)
for (i <- 1 to loop_times.toInt) {
  var cnt = 0
  while (cnt < record_count.toInt) {
    val message = new ProducerRecord[String, String](topic, null, "aaa")
    producer.send(message)
    cnt += 1
    if (cnt % 10000 == 0)
      println(s"send $cnt records")
  }
}
producer.close()
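Before blaming the streaming side, it may be worth ruling out silent drops in the producer itself. Below is a minimal sketch, not my actual code: it reuses producer, topic and record_count from the snippet above and the new producer client's Callback interface to count sends the broker rejected.

// Sketch only: count failed sends so producer-side drops can be ruled out.
import java.util.concurrent.atomic.AtomicLong
import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata}

val failed = new AtomicLong(0)
val callback = new Callback {
  override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit =
    if (exception != null) failed.incrementAndGet() // send was rejected or timed out
}

var cnt = 0
while (cnt < record_count.toInt) {
  producer.send(new ProducerRecord[String, String](topic, null, "aaa"), callback)
  cnt += 1
}
producer.close() // blocks until in-flight sends finish, so the counter is final
println(s"failed sends: ${failed.get()}")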
And here is my Spark Streaming code (it is the KafkaWordCount example that ships with Spark):
object KafkaWordCount {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println("Usage: KafkaWordCount <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L))
      .reduceByKeyAndWindow(_ + _, _ - _, Minutes(10), Seconds(2), 2)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
My Spark version is 1.6 and my Kafka version is 0.8.2.1.
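For comparison, here is a minimal sketch of the direct (receiver-less) Kafka stream that spark-streaming-kafka offers on Spark 1.6 with Kafka 0.8; the broker address, topic name, and object name below are placeholders, not my actual settings. It only counts the records that arrive per batch.

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaDirectCount {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("KafkaDirectCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // placeholders: replace with the real broker list and topic
    val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
    val topicsSet = Set("test")

    val lines = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
      .map(_._2)
    lines.count().print() // how many records arrived in each 2-second batch

    ssc.start()
    ssc.awaitTermination()
  }
}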

Related

How to correctly read the corresponding columnChunk in Parquet according to the specified schema?

1. I use a custom Spark SQL plugin to start a spark-shell terminal and execute the following commands in it:
import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
sc.setLogLevel("WARN")
val spark: SparkSession = SparkSession.builder().appName("Test").config("parquet.enable.dictionary","false").getOrCreate()
val res = spark.sql("SELECT USER_ID FROM TEST_PARQUET_10G where USER_ID >= 0 and HOUR_ID > 0")
res.collect()
2. This is my table creation statement
CREATE TABLE TEST_PARQUET_10G(
  USER_ID       BIGINT,
  SERIAL_NUMBER BIGINT,
  KAFKA_TIME    BIGINT,
  ACCT_DATE     BIGINT,
  HOUR_ID       BIGINT
)
COMMENT 'Long type'
STORED AS PARQUET
TBLPROPERTIES('parquet.compression'='none')
LOCATION '/user/hive/warehouse/hanlei/Long_PARQUET/TEST_PARQUET_10G'
3. I customized a SQL plugin in which I rewrote the rules for reading Parquet files. This is my reader; in this step I set USER_ID and HOUR_ID as the requested columns:
private def createParquetFileReader(file: PartitionedFile): ParquetFileReader = {
  val conf = broadcastedConf.value.value
  val filePath = new Path(file.filePath)
  val split = new FileSplit(filePath, file.start, file.length, Array.empty[String])
  val reader = ParquetFileReader.open(HadoopInputFile.fromPath(filePath, conf))
  val requiredNameArr = readDataSchema.fieldNames
  val requiredSchemaName: Set[String] = requiredNameArr.toSet
  val fields: util.List[Type] = reader.getFileMetaData.getSchema.getFields
  fields.removeIf {
    field => !requiredSchemaName.contains(field.getName)
  }
  requiredSchema = new MessageType("requiredSchema", fields)
  reader.setRequestedSchema(requiredSchema)
  reader
}
4. But when I call the reader's readNextRowGroup method, the program reports an error (the stack trace is in step 6). Here is the code:
class ParquetDataPagesPartitionReader(
    parquetReader: ParquetFileReader,
    readDataSchema: StructType,
    requiredSchema: MessageType
  ) extends PartitionReader[ColumnarBatch]
    with ScanWithMetrics
    with Arm
    with Logging {

  private var batch: Option[ColumnarBatch] = None
  //private val columns: Iterator[String] = readDataSchema.fieldNames.iterator
  var rowGroup: PageReadStore = _
  val columns: util.List[ColumnDescriptor] = requiredSchema.getColumns

  override def next(): Boolean = {
    batch.foreach(_.close())
    rowGroup = parquetReader.readNextRowGroup()
    batch = if (rowGroup != null) {
      readBatch()
    } else {
      None
    }
    batch.isDefined
  }

  // one batch is all columnChunk's dataPages from a rowGroup
  private def readBatch(): Option[ColumnarBatch] = {
    val starTime = System.currentTimeMillis()
    // val starTime = System.nanoTime()
    logError(s"startTime ${starTime}")
    val dpuColumnVectors: Array[DpuColumnVector] = new Array[DpuColumnVector](columns.size())
    for (i <- 0 until readDataSchema.length) {
      val raceDType = DpuColumnVector.getRaceDataType(readDataSchema(i).dataType)
      val raceVec = new RaceColumnVector(raceDType, DpuBatchUtils.DPU_MAX_ROWS)
      logDebug(s"try to malloc ${i} column")
      raceVec.setRowSizeAndMalloc()
      logDebug(s"successful malloc ${i} column")
      dpuColumnVectors(i) = new DpuColumnVector(readDataSchema(i).dataType, raceVec)
    }
    var useTime: Long = 0
    for (index <- 0 until readDataSchema.length) {
      var offset: Int = 0
      val pageReader = rowGroup.getPageReader(columns.get(index))
      var page: DataPageV1 = pageReader.readPage.asInstanceOf[DataPageV1]
      logDebug(s"appending data to column ${index}")
      while (page != null && offset < DpuBatchUtils.DPU_MAX_ROWS) {
        dpuColumnVectors(index).dataType() match {
          case LongType =>
            val startReadingIntoMemory = System.currentTimeMillis()
            val byteArray: Array[Byte] = page.getBytes.toByteArray
            val bytes: Array[Byte] = byteArray.slice(8, byteArray.length)
            val endReadIntoMemory = System.currentTimeMillis()
            //logError(s"successful read into memory column: ${columns.get(index).getPath.apply(0)} use time: ${endReadIntoMemory - startReadingIntoMemory}")
            dpuColumnVectors(index).getRaceColumnVector
              // todo: The byte array here is not decoded
              .appendValuesFromBuffer(bytes, 0, page.getValueCount)
            val endAppendToRace = System.currentTimeMillis()
            //logError(s"successful read into race, appended ${page.getValueCount} rows from ${offset} use time ${endAppendToRace - endReadIntoMemory}")
          case _ => throw new RuntimeException(RaceConstant.UNSUPPORTED_DATA_TYPE)
        }
        offset += page.getValueCount
        page = pageReader.readPage.asInstanceOf[DataPageV1]
      }
    }
    val endTime = System.currentTimeMillis()
    // val endTime = System.nanoTime()
    // useTime = useTime + endTime - starTime
    val columnarBatch = new ColumnarBatch(dpuColumnVectors.toArray, rowGroup.getRowCount.toInt)
    logError(s"endTime: ${endTime}")
    logError(s"read batch success, write to race successful, batchRows: ${rowGroup.getRowCount}, column number: ${columns.size()}, use time: ${endTime - starTime}")
    Some(columnarBatch)
  }

  override def get(): ColumnarBatch = {
    val ret = batch.getOrElse(throw new NoSuchElementException)
    batch = None
    ret
  }

  override def close(): Unit = {
    if (parquetReader != null) parquetReader.close()
    batch.foreach(_.close())
    batch = None
  }
}
5. In this reader I have the schema I injected, but the column subscripts are still mapped according to the original file schema (a name-based lookup sketch appears at the end of this question).
The injected columns are HOUR_ID and USER_ID.
6. Here is my call stack info
stack info
How do I read parquet's columnChunk through a custom mapping?
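For the subscript problem in step 5, here is a minimal sketch of a name-based lookup. The helper name descriptorsInReadOrder is made up for illustration; requiredSchema and readDataSchema are the same values used in the reader above. It resolves each requested Spark field to its Parquet ColumnDescriptor by path name instead of by position.

import org.apache.parquet.column.ColumnDescriptor
import org.apache.parquet.schema.MessageType
import org.apache.spark.sql.types.StructType
import scala.collection.JavaConverters._

// Sketch only: index the requested schema's descriptors by their top-level
// path name, then resolve every Spark field through that map.
def descriptorsInReadOrder(requiredSchema: MessageType,
                           readDataSchema: StructType): Array[ColumnDescriptor] = {
  val byName: Map[String, ColumnDescriptor] =
    requiredSchema.getColumns.asScala.map(d => d.getPath.head -> d).toMap
  readDataSchema.fieldNames.map { name =>
    byName.getOrElse(name,
      throw new IllegalArgumentException(s"column $name is not in the requested schema"))
  }
}

With descriptors looked up this way, rowGroup.getPageReader(descriptors(index)) lines up with readDataSchema(index) regardless of how the columns are ordered in the original file schema.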

BCH raw transaction not working when using the TrustWallet library (implementation 'com.trustwallet:wallet-core:2.6.3')

{"error":"RPC is disabled. This version of Bitcoin Cash Node is old and
may be out of consensus with the network. It is recommended that you
upgrade. To proceed without upgrading, and re-enable the RPC interface,
restart the node with the configuration option expire=0."}
I am getting this error while sending BCH using the Trust Wallet library, even though I updated to the latest version (implementation 'com.trustwallet:wallet-core:2.6.3'). Kindly help me with this issue.
Here is my BCH transaction code:
private fun getUtox(receiveAddress: String, finalreceive: String) {
    showProgress()
    if (bchutoxlist.size > 0) {
        val seed1 = UtilsDefault.getSharedPreferenceString(Constants.SEED)
        val wallet = HDWallet(seed1, "")
        val coinBch = CoinType.BITCOINCASH
        val address = wallet.getAddressForCoin(coinBch)
        val value = finalreceive.toDouble()
        val vv = value * 100000000
        val myvalue = vv.toLong()
        val secretPrivateKeyBtc = wallet.getKeyForCoin(coinBch)
        val toAddress = receiveAddress
        val changeAddress = address
        val script = BitcoinScript.lockScriptForAddress(address, coinBch).data()

        val input = Bitcoin.SigningInput.newBuilder().apply {
            this.amount = myvalue
            this.hashType = BitcoinScript.hashTypeForCoin(coinBch)
            this.toAddress = toAddress
            this.useMaxAmount = fullAmount
            this.changeAddress = changeAddress
            this.byteFee = btcgas.toLong()
        }

        for (i in 0..bchutoxlist.size.minus(1)) {
            val txid = bchutoxlist[i].txid
            val vout = bchutoxlist[i].vout
            val satoshi = bchutoxlist[i].satoshis
            val utox = Numeric.hexStringToByteArray(txid)
            val utxoTxId = utox

            val outPoint = Bitcoin.OutPoint.newBuilder().apply {
                this.hash = ByteString.copyFrom(utxoTxId.reversedArray())
                this.sequence = Long.MAX_VALUE.toInt()
                this.index = vout!!
            }.build()

            val utxo = Bitcoin.UnspentTransaction.newBuilder().apply {
                this.amount = satoshi!!
                this.outPoint = outPoint
                this.script = ByteString.copyFrom(script)
            }.build()

            input.addUtxo(utxo)
            input.addPrivateKey(ByteString.copyFrom(secretPrivateKeyBtc.data()))
        }

        val output = AnySigner.sign(input.build(), coinBch, Bitcoin.SigningOutput.parser())
        val signedTransaction = output.encoded?.toByteArray()
        val hex = signedTransaction?.toHexString()
        if (hex != "") {
            sendBch(hex!!, address, myvalue)
        } else {
            runOnUiThread {
                hideProgress()
                Toast.makeText(
                    this@ConfirmDepositActivity,
                    getString(R.string.withdraw_failed),
                    Toast.LENGTH_SHORT
                ).show()
                finish()
            }
        }
    } else {
        runOnUiThread {
            hideProgress()
            Toast.makeText(
                this@ConfirmDepositActivity,
                "No Unspent Amount",
                Toast.LENGTH_SHORT
            ).show()
            finish()
        }
    }
}

The same operation from Spark and from the Hive shell has different effects. Why?

This code inserts data from Spark:
String warehouseLocation = new File("spark-warehouse").getAbsolutePath();
SparkSession sparkSession = SparkSession.builder()
        .appName(appName)
        .config("spark.sql.warehouse.dir", warehouseLocation)
        .config("spark.sql.catalogImplementation", "hive")
        .enableHiveSupport()
        .config("hive.exec.dynamic.partition", "true")
        .config("hive.exec.dynamic.partition.mode", "nonstrict")
        .getOrCreate();

JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()),
        Durations.seconds(duration));
SQLContext sqlContext = sparkSession.sqlContext();

sqlContext.sql("CREATE TABLE IF NOT EXISTS " + tableName + " (value1 STRING, value2 STRING, value3 STRING, " +
        "value4 STRING, value5 STRING, value6 STRING, value7 STRING) PARTITIONED BY (year STRING, mounth STRING, day STRING)" +
        " STORED AS ORC");
sqlContext.sql("SET hive.merge.tezfiles=true");
sqlContext.sql("SET hive.merge.mapfiles=true");
sqlContext.sql("SET hive.merge.size.per.task=256000000");
sqlContext.sql("SET hive.merge.smallfiles.avgsize=16000000");
sqlContext.sql("SET hive.merge.orcfile.stripe.level=true;");

Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", broker);
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);

Collection<String> topicsSet = Collections.singletonList(topic);
// Create direct kafka stream with brokers and topics
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(
        jssc,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.Subscribe(topicsSet, kafkaParams));

// Get the lines, split them into words, count the words and print
JavaDStream<String> lines = messages.map(ConsumerRecord::value);
lines.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    @Override
    public void call(JavaRDD<String> rdd) {
        if (!rdd.isEmpty()) {
            JavaRDD<Data> dataRDD = rdd.map(new Function<String, Data>() {
                @Override
                public Data call(String msg) {
                    try {
                        return Data.insertDataByString(msg);
                    } catch (ParseException e) {
                        e.printStackTrace();
                    }
                    return null;
                }
            });
            Dataset<Row> dataRow = sqlContext.createDataFrame(dataRDD, Data.class);
            dataRow.createOrReplaceTempView("temp_table");
            sqlContext.sql("insert into " + tableName + " partition(year,mounth,day) select value1, value2, " +
                    "value3, value4, value5, value6, value7, year, mounth, day from temp_table");
            //dataRow.write().format("orc").partitionBy("year", "day").mode(SaveMode.Append).insertInto(tableName);
            //sqlContext.sql("ALTER TABLE " + tableName + " PARTITION(year='2020', mounth='4', day='26') " + " CONCATENATE");
        }
    }
});
When I execute this code, the table is created in
hdfs://master.vmware.local:8020/apps/spark/warehouse/tablename/year=2020/mounth=4/day=26
and inside day=26 there are several *.c000 files.
If instead I create the table from the Hive shell, the table ends up in a different place,
hdfs://master.vmware.local:8020/warehouse/tablespace/managed/hive/table_name/year=2020/mounth=4/day=26/
and inside day=26 there are the files _orc_acid_version and _bucket_000000.
My objective is to create ORC files with Spark, but I think that when writing from Spark I am not getting Hive's default file layout.
How can I save data from Spark into the Hive table as ORC files?
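For reference, a minimal Scala sketch along the lines of the writer that is commented out in the code above. It assumes the cluster's hive-site.xml is on Spark's classpath, so the session uses the same metastore and warehouse as the Hive shell, and that dataRow and tableName stand for the DataFrame and table name from the code above.

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

// Sketch only: append a batch into the existing Hive table and let the table
// definition (STORED AS ORC) decide the file format and location.
val spark = SparkSession.builder()
  .appName("orc-insert")
  .enableHiveSupport() // needs hive-site.xml so Spark talks to the shared Hive metastore
  .config("hive.exec.dynamic.partition", "true")
  .config("hive.exec.dynamic.partition.mode", "nonstrict")
  .getOrCreate()

def writeBatch(dataRow: DataFrame, tableName: String): Unit = {
  dataRow.write
    .mode(SaveMode.Append)
    .insertInto(tableName) // writes ORC files into the table's own warehouse directory
}

As far as I can tell, where the files end up is decided by the metastore and warehouse configuration in effect when the table is created, not by the write API itself.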

Using updateStateByKey on a case class

I am trying to write an updateStateByKey function that keeps the first value seen for each key. When I try to use it with a case class, I get an error.
import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.apache.spark.streaming._

case class Persons(name: String, school: String)

object StatefulNetworkWordCount {

  def getPerson(str: String): Persons = {
    val splitArray = str.split(",")
    val name = splitArray(0)
    val school = splitArray(1)
    Persons(name, school)
  }

  //Now, newValues is the new set of values
  //runningCount is the existing values for each key
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }

  def updateFunctionFrist(newValues: Seq[String], runningCount: Option[String]): Option[String] = {
    val newWord = if (runningCount.getOrElse("") == "") {
      val str = newValues.head.toString //Use existing values
      Some(str)
    } else {
      val str = runningCount.getOrElse(newValues.head.toString)
      Some(str)
    }
    newWord
  }

  def updateFunctionFirstPerson(newValues: Seq[Person], state: Option[Person]): Option[Person] = {
    val newWord = if (state.getOrElse("") == "") { //If running count is empty
      val str = newValues.head.asInstanceOf[Person]
      Some(str)
    } else {
      val str = state.getOrElse(newValues.head.asInstanceOf[Person])
      Some(str)
    }
    newWord
  }

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[8]").setAppName("StatefulNetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(10))
    // Set checkpoint directory
    ssc.checkpoint(".")
    // Create a DStream that will connect to hostname:port, like localhost:9999
    val lines = ssc.socketTextStream("localhost", 9999)
    // Split each line into words
    val words = lines.flatMap(_.split(" "))
    // Count each word in each batch
    val pairs = words.map(word => (word.hashCode, word))
    val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _)
    runningCounts.print()
    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
The line
val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _)
throws an error, but if I use the
val runningCounts = pairs.updateStateByKey[String](updateFunctionFirst _)
to get the first value for a key, it works fine. Can we use a custom class in updateStateByKey? How can I use it?
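For what it's worth, here is a minimal self-contained sketch (not the code above) of updateStateByKey with a case class as the state type. The point it illustrates: on a DStream[(K, String)] the update function must accept Seq[String], the stream's value type, while the state type parameter can be the case class. It assumes each line is a comma-separated name,school pair; the object name is made up.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

case class Persons(name: String, school: String)

object FirstPersonPerKey {
  // Values arrive as raw Strings; the state kept per key is a Persons instance.
  def keepFirstPerson(newValues: Seq[String], state: Option[Persons]): Option[Persons] =
    state.orElse(newValues.headOption.map { str =>
      val Array(name, school) = str.split(",") // assumes exactly two comma-separated fields
      Persons(name, school)
    })

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("FirstPersonPerKey")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(".")

    val lines = ssc.socketTextStream("localhost", 9999)
    val pairs = lines.map(line => (line.hashCode, line))

    // State type is Persons; the value type of `pairs` is String, so the
    // function signature is (Seq[String], Option[Persons]) => Option[Persons].
    val firstPersons = pairs.updateStateByKey[Persons](keepFirstPerson _)
    firstPersons.print()

    ssc.start()
    ssc.awaitTermination()
  }
}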

Looking for Streaming Spark ML example with raw data as input

I am a newbie to Spark ML. I am looking for a streaming ML example with raw data as input (I mean raw string-delimited data, not already-vectorized data).
I looked through most of the forums for a similar example but couldn't find any.
So I approached Streaming KMeans on Spark 1.6 the following way (Streaming KMeans still works on vector data rather than DataFrames), but I am not sure this is the right way.
I take the input DStream records from Kafka and convert them to a DataFrame.
I built a data-prep pipeline to read the columns I need and convert them to vectors.
Since the data is different in every stream batch and the vector length may differ, I use Tokenizer and HashingTF to keep the vector length constant.
Also, to identify which row each vector belongs to within a batch, I assign a row number and send that along to the Streaming KMeans algorithm.
After getting the cluster ID for each row, I join the cluster ID back to the row data to get the final predicted DataFrame.
Now I have doubts about the way I implemented this.
To my understanding, using HashingTF with a fixed vector length (setNumFeatures) will not solve my issue, because the same vectors may repeat if the number of distinct combinations is smaller than the number of rows in a single batch (it is hard-coded to 200 in my code below).
I also tried StringIndexer with OneHotEncoder to generate the vectors, but I see the vector length differing between batches.
Since we have to assign RandomCenters for StreamingKMeans ahead of time, I need to know the vector length before execution (for my test data and columns I got a vector length of 800, so I hard-coded it for now for testing).
Although the training vectors with all combinations of data should really be sent separately to train the model, right now I am using the same vectors for both training and testing.
Can someone guide me or share a streaming ML example with raw data as input?
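A minimal sketch of the Tokenizer + HashingTF + StreamingKMeans wiring described above, against Spark 1.6 APIs. It reads raw lines from a socket instead of Kafka and hashes a single text column; the object name and the "text" column are made up. The point it shows is that setNumFeatures and setRandomCenters share one dimension, so the vector length never has to be guessed per batch.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKMeansSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("StreamingKMeansSketch"))
    val ssc = new StreamingContext(sc, Seconds(10))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val numFeatures = 200 // one knob controls both the hash size and the center dimension

    // Tokenizer + HashingTF give every batch the same vector length.
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("tokens")
    val hashingTF = new HashingTF().setInputCol("tokens").setOutputCol("features").setNumFeatures(numFeatures)
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF))

    // Random centers must have the same dimension as setNumFeatures.
    val model = new StreamingKMeans().setK(10).setDecayFactor(1.0).setRandomCenters(numFeatures, 0.0)

    // Raw delimited lines (here from a socket; a Kafka DStream works the same way).
    val lines = ssc.socketTextStream("localhost", 9999)
    val vectors = lines.transform { rdd =>
      val df = rdd.toDF("text")
      pipeline.fit(df).transform(df).map(_.getAs[Vector]("features"))
    }

    model.trainOn(vectors)
    model.predictOn(vectors).print()

    ssc.start()
    ssc.awaitTermination()
  }
}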
import com.common.Configuration._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import scala.util.Try;
object KafkaDataConsumer {
//localOrCluster=l streamBatchSeconds=2 locationToSaveStream=tmp3 isAvroTopic=text hiveTableName= modelName=streamkmeans hiveORFile=file locationToSaveModelData=tmp1 predictedDataLocation=tmp2 labelCol=salary nonStringCols= stringCols=workclass,age,education_num,hours_per_week csvOrHiveForTrain=data/mllib/adult.csv csvOrHiveForTest=data/mllib/adult.csv noOfNodes=10 noOfIter=10 locationToStorePmml=pmmlfolder runMLPipeline=true isOverwriteDataOk=no
// Setting the logs levels
setLogLevels(Level.WARN, Seq("spark", "org", "akka"))
def main(args: Array[String]) {
executeStreamdata(args)
}
def executeStreamdata(args: Array[String]) {
val namedArgs = getNamedArgs(args)
val localOrCluster = namedArgs("localOrCluster")
val streamBatchSeconds = namedArgs("streamBatchSeconds").toInt
val isAvroTopic = namedArgs("isAvroTopic").equalsIgnoreCase("avro")
val kafkaBrokerList = kafkaConfig.getString("kafkaBrokerList")
val kafkaTopicList =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_NAME")
else
kafkaConfig.getString("TOPIC2_NAME")
val SCHEMA =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_SCHEMA")
else
kafkaConfig.getString("TOPIC2_SCHEMA")
val labelCol = namedArgs("labelCol")
val nonStringCols = namedArgs("nonStringCols")
val stringCols = namedArgs("stringCols")
val noOfNodes = namedArgs("noOfNodes").toInt
val predictedDataLocation = namedArgs("predictedDataLocation")
val sc = getSparkContext(localOrCluster, "kafka stream application")
val ssc = new StreamingContext(sc, Seconds(streamBatchSeconds))
val sqlContext = new SQLContext(sc)
val topicsSet = kafkaTopicList.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)
//Read Stream records either as AVRO or String
val dStreamRecords =
if (isAvroTopic) {
val parser = new Schema.Parser
val schema = parser.parse(SCHEMA)
val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](ssc, kafkaParams, topicsSet)
.map(message => recordInjection.invert(message._2).get.toString)
}
else {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet).map(_._2)
}
println("Executing Streaming KMeans")
//Pipeline for generating the label, features vector using StringTokenizer and HashFunction based on input label Columns, string and nonString Columns
val pipelineStagesWithAssembler = buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols, true)
val pipeline = new Pipeline().setStages(pipelineStagesWithAssembler)
var vectorsDFSchema:StructType = null
def createDFFromDStream(rdd: RDD[String]): DataFrame = {
if (isAvroTopic)
sqlContext.read.json(rdd)
else
sqlContext.createDataFrame(rdd.map(x => Row.fromSeq(x.split(",").toSeq)), createSchemaFrmStrDelValues(SCHEMA))
}
val vectorDataWithRowNum = dStreamRecords.transform(rdd=>{
//Create dataframe from the input data
val dataframe = createDFFromDStream(rdd)
//Get the pipeline model
val pipelineModel = pipeline.fit(dataframe)
//Get label, features vectors in the DataF
val vectorsDF = pipelineModel.transform(dataframe)
//get the row number for the dataframe
val vectorsDFWithRowNum = dfWithRowIndexUsingRDD(vectorsDF)
vectorsDFSchema = StructType(vectorsDFWithRowNum.schema.filter(!_.dataType.toString.contains("Vector")))
//Get row number, vectors , row data
val reqData = vectorsDFWithRowNum.map(row => {
(row.getAs[Long]("row_num"), row.getAs[org.apache.spark.mllib.linalg.Vector]("features"), row)
})
reqData
})
// get train data(vector) , test data (row number or index, vector), row data (row number or index, row data)
val (dStreamTrainVector, dStreamTestVector, rowData) = (vectorDataWithRowNum.map(_._2), vectorDataWithRowNum.map(data=>(data._1, data._2)), vectorDataWithRowNum.map(data=>(data._1, data._3)))
// Need to Fix the Random Centers hardcoded 800
val model = new StreamingKMeans()
.setK(noOfNodes)
.setDecayFactor(1.0)
.setRandomCenters(800, 0)
// Train the Kmeans model
model.trainOn(dStreamTrainVector)
// Predict vales from the model
val joinData = model.predictOnValues(dStreamTestVector).join(rowData)
joinData.foreachRDD(rdd=> {
val seqData = rdd.map(x=>{
val id = x._1
val clusterID = x._2._1
val rowData = x._2._2.toSeq.filter(!_.isInstanceOf[org.apache.spark.mllib.linalg.SparseVector])
Row.fromSeq(rowData :+ clusterID.toString)
})
val dfPredictions = sqlContext.createDataFrame(seqData, vectorsDFSchema.add("clusterID", StringType))
dfPredictions.printSchema()
dfPredictions.show()
dfPredictions.write.mode(SaveMode.Append).save(predictedDataLocation)
})
ssc.start
ssc.awaitTermination
}
private def buildOneHotPipeLine(colName:String, isStreamJob:Boolean = false):Array[PipelineStage] = {
if(isStreamJob){
val tokenizer = new org.apache.spark.ml.feature.Tokenizer()
.setInputCol(s"${colName}")
.setOutputCol(s"${colName}_token")
val hashingTF = new org.apache.spark.ml.feature.HashingTF()
.setInputCol(s"${colName}_token")
.setOutputCol(s"${colName}_hashFeature").setNumFeatures(200)
Array(tokenizer, hashingTF)
}else {
val stringIndexer = new StringIndexer()
.setInputCol(s"$colName")
.setOutputCol(s"${colName}_index")
val oneHotEncoder = new OneHotEncoder()
.setInputCol(s"${colName}")
.setOutputCol(s"${colName}_onehotindex")
Array(stringIndexer, oneHotEncoder)
}
}
def buildDataPrepPipeLine(sqlContext: SQLContext, lableCol: String, stringCols: String, nonStringCols: String, isStreamJob:Boolean = false):Array[PipelineStage] = {
var pipelineStagesforFeatures : Array[PipelineStage]= null
var assemblerInputCols :Array[String] = null
if(stringCols != null && !stringCols.isEmpty ){
val stringColsArray = stringCols.split(",")
if(pipelineStagesforFeatures != null) {
pipelineStagesforFeatures = pipelineStagesforFeatures ++ stringColsArray.map(columnName => buildOneHotPipeLine(columnName, isStreamJob)).reduce(_ ++ _)
}else{
pipelineStagesforFeatures = stringColsArray.map(columnName => buildOneHotPipeLine(columnName, isStreamJob)).reduce(_++_)
}
if(assemblerInputCols != null) {
assemblerInputCols = assemblerInputCols ++ stringColsArray.map(colName => if(isStreamJob) s"${colName}_hashFeature" else s"${colName}_onehotindex")
}else{
assemblerInputCols = stringColsArray.map(colName => if(isStreamJob) s"${colName}_hashFeature" else s"${colName}_onehotindex")
}
}
if(nonStringCols != null && !nonStringCols.isEmpty ){
val nonStringColsArray = nonStringCols.split(",")
if(assemblerInputCols != null) {
assemblerInputCols = assemblerInputCols ++ nonStringColsArray
}else{
assemblerInputCols = nonStringColsArray
}
}
// Combine all the features and make it a single Feature
val assembler = new VectorAssembler()
.setInputCols(assemblerInputCols)
.setOutputCol("features")
var labelIndexer : StringIndexer= null
if(lableCol != null && !lableCol.isEmpty){
labelIndexer = new StringIndexer()
labelIndexer.setInputCol(lableCol)
labelIndexer.setOutputCol("label")
}
val pipelineStagesWithAssembler =
if(stringCols != null && !stringCols.isEmpty &&
nonStringCols != null && !nonStringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty) {
pipelineStagesforFeatures.toList ::: List(assembler,labelIndexer)
}else if( stringCols != null && !stringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler,labelIndexer)
} else if( nonStringCols != null && !nonStringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty){
List(assembler,labelIndexer)
}else if( stringCols != null && !stringCols.isEmpty &&
nonStringCols != null && !nonStringCols.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler)
}else if( stringCols != null && !stringCols.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler)
}else{
List(assembler)
}
pipelineStagesWithAssembler.toArray
}
def dfWithRowIndexUsingRDD(df: DataFrame, offset: Int = 1, colName: String = "row_num", inFront: Boolean = true): DataFrame = {
df.sqlContext.createDataFrame(
df.rdd.zipWithIndex.map(ln =>
Row.fromSeq(
(if (inFront) Seq(ln._2 + offset) else Seq())
++ ln._1.toSeq ++
(if (inFront) Seq() else Seq(ln._2 + offset))
)
),
StructType(
(if (inFront) Array(StructField(colName, LongType, false)) else Array[StructField]())
++ df.schema.fields ++
(if (inFront) Array[StructField]() else Array(StructField(colName, LongType, false)))
)
)
}
def getSparkContext(runLocal: String, appName: String) = {
val sc: SparkContext = if (runLocal.equalsIgnoreCase("local") || runLocal.equalsIgnoreCase("l")) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local[1]", appName, sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName(appName)
new SparkContext(sparkConfig)
}
sc.hadoopConfiguration.setBoolean("parquet.enable.summary-metadata", false)
sc
}
def createSchemaFrmStrDelValues(baseSchema: String): StructType = {
return StructType(baseSchema.split(",").map(f => StructField(f, StringType, true)))
}
def getNamedArgs(args: Array[String]): Map[String, String] = {
println("################### Input parameters are ############### " + args.toList)
args.filter(line => line.contains("=")) //take only named arguments
.map(x => {
val key = x.substring(0, x.indexOf("="))
val value = x.substring(x.indexOf("=") + 1)
(key, if (value == null || "".equalsIgnoreCase(value)) null else value)
}).toMap //convert to a map
}
def setLogLevels(level: Level, loggers: Seq[String]): Map[String, Level] = loggers.map(loggerName => {
val logger = Logger.getLogger(loggerName)
val prevLevel = logger.getLevel
logger.setLevel(level)
loggerName -> prevLevel
}).toMap
def pullDataFromCSVFile(sqlContext: SQLContext, isHeaderExist: Boolean, filePath: String, delimiter: String, csvSplit: String): DataFrame = {
var csvDataFrame: DataFrame = null
try {
if (isHeaderExist) {
csvDataFrame = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load(filePath)
} else {
if (csvSplit != null) {
val schema = createSchemaFrmStrDelValues(csvSplit)
csvDataFrame = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "false")
.option("delimiter", delimiter)
.option("inferSchema", "false")
.schema(schema)
.load(filePath)
}
}
} catch {
case ex: Exception => {
println("Unable to read the CSV file from the location " + filePath)
ex.printStackTrace()
throw ex
}
}
csvDataFrame
}
def executeBatch(args: Array[String]) {
val namedArgs = getNamedArgs(args)
val runLocal = namedArgs("clusterORLocal")
val sc: SparkContext = getSparkContext(runLocal, "Ml Pipeline")
val csvOrHiveForTrain = namedArgs("csvOrHiveForTrain")
val csvOrHiveForTest = namedArgs("csvOrHiveForTest")
val locationToSaveModelData = namedArgs("locationToSaveModelData")
val labelCol = namedArgs("labelCol")
val featureCols = namedArgs("featureCols").toString.split(",").map(_.toLowerCase)
val noOfNodes = namedArgs("noOfNodes").toInt
val noOfIter = namedArgs("noOfIter").toInt
val sqlContext = new SQLContext(sc);
val trainDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTrain, null, null)
trainDF.printSchema()
val testDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTest, null, null)
testDF.printSchema()
val rootLogger = Logger.getRootLogger()
rootLogger.setLevel(Level.ERROR)
var nonStringCols = ""
var stringCols = ""
for (field <- testDF.schema){
val fieldName = field.name.trim
val fieldDataType = field.dataType.typeName.trim
if(featureCols.contains(fieldName.toLowerCase())){
if(fieldDataType.equalsIgnoreCase("integer") || fieldDataType.equalsIgnoreCase("long") ||
fieldDataType.equalsIgnoreCase("DOUBLE") || fieldDataType.equalsIgnoreCase("FLOAT") ||
fieldDataType.equalsIgnoreCase("DECIMAL"))
nonStringCols +=fieldName+","
else if(fieldDataType.equalsIgnoreCase("string"))
stringCols +=fieldName+","
}
}
if(nonStringCols.isEmpty && stringCols.isEmpty){
throw new Exception("Check if Feature columns are empty")
}
val reqModel = Try {
KMeansModel.load(locationToSaveModelData)
}.getOrElse({
val kmeans = new KMeans().setK(noOfNodes).setMaxIter(noOfIter)
kmeans
})
//Build pipeline
val pipeline = new Pipeline().setStages(buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols) ++ Array(reqModel))
//Model generated and used to prepare data for model
val pipelineModel = pipeline.fit(trainDF)
//predict using model using the prepared data thru the pipeline
val testPredictions = pipelineModel.transform(testDF)
testPredictions.show()
}
}
