1. I use a custom Spark SQL plugin to start a spark-shell session and execute the following commands in it:
import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}
sc.setLogLevel("WARN")
val spark: SparkSession = SparkSession.builder().appName("Test").config("parquet.enable.dictionary","false").getOrCreate()
val res = spark.sql("SELECT USER_ID FROM TEST_PARQUET_10G where USER_ID >= 0 and HOUR_ID > 0")
res.collect()
2. This is my table creation statement
CREATE TABLE TEST_PARQUET_10G (
  USER_ID BIGINT,
  SERIAL_NUMBER BIGINT,
  KAFKA_TIME BIGINT,
  ACCT_DATE BIGINT,
  HOUR_ID BIGINT
)
COMMENT 'Long type'
STORED AS PARQUET
LOCATION '/user/hive/warehouse/hanlei/Long_PARQUET/TEST_PARQUET_10G'
TBLPROPERTIES ('parquet.compression'='none')
3. I customized a SQL plugin in which I rewrote the rules for reading Parquet files. In this step I set USER_ID and HOUR_ID as the required columns in my reader.
This is my reader code:
private def createParquetFileReader(file: PartitionedFile): ParquetFileReader = {
  val conf = broadcastedConf.value.value
  val filePath = new Path(file.filePath)
  val split = new FileSplit(filePath, file.start, file.length, Array.empty[String])
  val reader = ParquetFileReader.open(HadoopInputFile.fromPath(filePath, conf))
  val requiredNameArr = readDataSchema.fieldNames
  val requiredSchemaName: Set[String] = requiredNameArr.toSet
  // drop every field that is not in readDataSchema from the file schema's field list
  val fields: util.List[Type] = reader.getFileMetaData.getSchema.getFields
  fields.removeIf { field =>
    !requiredSchemaName.contains(field.getName)
  }
  requiredSchema = new MessageType("requiredSchema", fields)
  reader.setRequestedSchema(requiredSchema)
  reader
}
4. But when I call the readNextRowGroup method of the reader, the program reports an error (see the stack info in step 6).
The reader code that consumes the row groups is below:
class ParquetDataPagesPartitionReader(
    parquetReader: ParquetFileReader,
    readDataSchema: StructType,
    requiredSchema: MessageType
) extends PartitionReader[ColumnarBatch]
  with ScanWithMetrics
  with Arm
  with Logging {

  private var batch: Option[ColumnarBatch] = None
  //private val columns: Iterator[String] = readDataSchema.fieldNames.iterator
  var rowGroup: PageReadStore = _
  val columns: util.List[ColumnDescriptor] = requiredSchema.getColumns

  override def next(): Boolean = {
    batch.foreach(_.close())
    rowGroup = parquetReader.readNextRowGroup()
    batch = if (rowGroup != null) {
      readBatch()
    } else {
      None
    }
    batch.isDefined
  }

  // one batch is all columnChunk's dataPages from a rowGroup
  private def readBatch(): Option[ColumnarBatch] = {
    val starTime = System.currentTimeMillis()
    // val starTime = System.nanoTime()
    logError(s"startTime ${starTime}")
    val dpuColumnVectors: Array[DpuColumnVector] = new Array[DpuColumnVector](columns.size())
    for (i <- 0 until readDataSchema.length) {
      val raceDType = DpuColumnVector.getRaceDataType(readDataSchema(i).dataType)
      val raceVec = new RaceColumnVector(raceDType, DpuBatchUtils.DPU_MAX_ROWS)
      logDebug(s"try to malloc ${i} column")
      raceVec.setRowSizeAndMalloc()
      logDebug(s"successful malloc ${i} column")
      dpuColumnVectors(i) = new DpuColumnVector(readDataSchema(i).dataType, raceVec)
    }
    var useTime: Long = 0
    for (index <- 0 until readDataSchema.length) {
      var offset: Int = 0
      val pageReader = rowGroup.getPageReader(columns.get(index))
      var page: DataPageV1 = pageReader.readPage.asInstanceOf[DataPageV1]
      logDebug(s"appending data to column ${index}")
      while (page != null && offset < DpuBatchUtils.DPU_MAX_ROWS) {
        dpuColumnVectors(index).dataType() match {
          case LongType =>
            val startReadingIntoMemory = System.currentTimeMillis()
            val byteArray: Array[Byte] = page.getBytes.toByteArray
            val bytes: Array[Byte] = byteArray.slice(8, byteArray.length)
            val endReadIntoMemory = System.currentTimeMillis()
            //logError(s"successful read into memory column: ${columns.get(index).getPath.apply(0)} use time: ${endReadIntoMemory - startReadingIntoMemory}")
            dpuColumnVectors(index).getRaceColumnVector
              // todo: The byte array here is not decoded
              .appendValuesFromBuffer(bytes, 0, page.getValueCount)
            val endAppendToRace = System.currentTimeMillis()
            //logError(s"successful read into race, appended ${page.getValueCount} rows from ${offset} use time ${endAppendToRace - endReadIntoMemory}")
          case _ => throw new RuntimeException(RaceConstant.UNSUPPORTED_DATA_TYPE)
        }
        offset += page.getValueCount
        page = pageReader.readPage.asInstanceOf[DataPageV1]
      }
    }
    val endTime = System.currentTimeMillis()
    // val endTime = System.nanoTime()
    // useTime = useTime + endTime - starTime
    val columnarBatch = new ColumnarBatch(dpuColumnVectors.toArray, rowGroup.getRowCount.toInt)
    logError(s"endTime: ${endTime}")
    logError(s"read batch success, write to race successful,batchRows: ${rowGroup.getRowCount}, column number: ${columns.size()}, use time: ${endTime - starTime}")
    Some(columnarBatch)
  }

  override def get(): ColumnarBatch = {
    val ret = batch.getOrElse(throw new NoSuchElementException)
    batch = None
    ret
  }

  override def close(): Unit = {
    if (parquetReader != null) parquetReader.close()
    batch.foreach(_.close())
    batch = None
  }
}
5. In this reader I have the schema I injected, but the column index is still mapped according to the original file schema.
(The injected schema contains only HOUR_ID and USER_ID.)
6. Here is my call stack info
stack info
How do I read Parquet column chunks through a custom column mapping?
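A minimal sketch of one way to avoid the index mismatch, assuming the parquet-hadoop classes already used above (ParquetFileReader, MessageType, ColumnDescriptor): build the requested MessageType by copying the selected fields instead of mutating the file schema, and look each column up by name rather than by position. The helper names below are illustrative, not part of the plugin.

import scala.collection.JavaConverters._
import org.apache.parquet.column.ColumnDescriptor
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType
import org.apache.spark.sql.types.StructType

// Hypothetical helper: build the requested schema without calling removeIf on the
// file schema's field list, so the original column ordering stays intact.
def buildRequestedSchema(reader: ParquetFileReader, readDataSchema: StructType): MessageType = {
  val fileSchema = reader.getFileMetaData.getSchema
  val selected = readDataSchema.fieldNames.toSeq
    .filter(name => fileSchema.containsField(name))
    .map(name => fileSchema.getType(name))
  new MessageType("requiredSchema", selected.asJava)
}

// Hypothetical helper: map top-level column names to their ColumnDescriptors.
def columnsByName(requiredSchema: MessageType): Map[String, ColumnDescriptor] =
  requiredSchema.getColumns.asScala.map(cd => cd.getPath()(0) -> cd).toMap

// usage sketch inside the reader:
// val requiredSchema = buildRequestedSchema(reader, readDataSchema)
// reader.setRequestedSchema(requiredSchema)
// val colsByName = columnsByName(requiredSchema)
// val pageReader = rowGroup.getPageReader(colsByName(readDataSchema(index).name))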
I have an HDFS location, and there is a zip file inside that location:
HDFS location /development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/records.zip
scala> val loc = "/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/"
loc: String = "/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/"
scala> val rdd = sc.textFile(loc)
rdd: org.apache.spark.rdd.RDD[String] = /development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/ MapPartitionsRDD[1] at textFile at <console>:26
scala> rdd.take(2)
res0: Array[String] = Array(PK????????]R�R��*�????�??? ???2972120.dat�S�r�0?
��*�0����?t?�]T�Ж??����
`�6ط�kU;P�M�� rSO�;G��p��?��?�Z1^3#�^�� ��F��ٕb�?~,ٖ
�u6�D��'�#�??��L*�Gp?�kcL�7!r�p1�1e�� a*.{?
�.;��������s�(�)�, ?�=�9U<"*!?5��?;�?�?�مd{h}
��gG���� �?�Z)
but it produces garbled output.
Can you help with how to read a file inside a zip file using a Spark RDD? There is only one file inside my zip file.
Are you looking for something like this:
import java.io.{ IOException, FileOutputStream, FileInputStream, File }
import java.util.zip.{ ZipEntry, ZipInputStream }
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkContext

//Unzip the file and copy the internal contents outside in new location
object Unzip extends App {

  val INPUT_ZIP_FILE: String = "src/resources/my-zip.zip"
  val OUTPUT_FOLDER: String = "src/resources/my-zip"

  def unZipIt(zipFile: String, outputFolder: String): Unit = {
    val buffer = new Array[Byte](1024)
    try {
      //output directory
      val folder = new File(OUTPUT_FOLDER)
      if (!folder.exists()) {
        folder.mkdir()
      }
      //zip file content
      val zis: ZipInputStream = new ZipInputStream(new FileInputStream(zipFile))
      //get the zipped file list entry
      var ze: ZipEntry = zis.getNextEntry()
      while (ze != null) {
        val fileName = ze.getName()
        val newFile = new File(outputFolder + File.separator + fileName)
        System.out.println("file unzip : " + newFile.getAbsoluteFile())
        //create folders
        new File(newFile.getParent()).mkdirs()
        val fos = new FileOutputStream(newFile)
        var len: Int = zis.read(buffer)
        while (len > 0) {
          fos.write(buffer, 0, len)
          len = zis.read(buffer)
        }
        fos.close()
        ze = zis.getNextEntry()
      }
      zis.closeEntry()
      zis.close()
    } catch {
      case e: IOException => println("exception caught: " + e.getMessage)
    }
  }

  Unzip.unZipIt(INPUT_ZIP_FILE, OUTPUT_FOLDER)

  val sac = new SparkContext("local[*]", " first Program")
  val sqlc = new SQLContext(sac)
  val rdd = sac.textFile("src/resources/my-zip/sample.txt")
  rdd.take(1).foreach(println)

  /*val rddFromFile = sqlc.sparkContext.textFile("src/resources/my-zip/sample.txt")
  println(rddFromFile.getClass)
  println("##Get data Using collect")
  rddFromFile.collect().foreach(f => {
    println(f)
  })*/
}
Not sure if this achieves what you want to do, but maybe it could help a bit!
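If the goal is to read the zipped file directly from HDFS as an RDD, without unzipping to the local filesystem first, here is a minimal sketch using sc.binaryFiles and java.util.zip, assuming the single entry is plain text (the path is the staging directory from the question):

import java.util.zip.ZipInputStream
import scala.io.Source

val loc = "/development/staging/b8baf3f4-abce-11eb-8592-0242ac110032/"
// binaryFiles gives an RDD[(path, PortableDataStream)]; decompress each zip on the executors
val lines = sc.binaryFiles(loc).flatMap { case (_, stream) =>
  val zis = new ZipInputStream(stream.open())
  // the question states there is exactly one file inside the zip
  Option(zis.getNextEntry) match {
    case Some(_) => Source.fromInputStream(zis).getLines().toList
    case None    => Nil
  }
}
lines.take(2).foreach(println)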
I'm trying to read multiple xls files that are in a .zip file using the code below. I'm getting a "missed end of block" error. Do I need to add an EOF kind of character at the end of the byte array before I send it to POI?
The code is below:
val zipStream = new ZipInputStream(inputStream)
var zipEntry = null
while (zipEntry = zipStream.getNextEntry != null) {
  val bytes = new Array[Byte](zipEntry.getSize.toInt)
  zipStream.readBytes(bytes)
  val xlsByteStream = new ByteArrayInputStream(bytes)
  val workbook = new XSSFWorkbook(xlsByteStream)
}
No EOF marker is needed; the problem is that each zip entry is not read completely. Reading the whole entry into a buffer before handing it to POI avoids the truncated stream. With the missing declarations and imports filled in:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.util.zip.{ZipEntry, ZipInputStream}
import org.apache.poi.xssf.usermodel.XSSFWorkbook

val zipStream = new ZipInputStream(inputStream)
var zipEntry: ZipEntry = null
while ({ zipEntry = zipStream.getNextEntry; zipEntry != null }) {
  val bytes = new Array[Byte](1024)
  // ZipEntry.getSize can be -1 when streaming, so don't size the buffer from it
  val bos = new ByteArrayOutputStream()
  var i = 0
  while ({ i = zipStream.read(bytes); i > 0 }) {
    bos.write(bytes, 0, i)
  }
  bos.close()
  val xlsByteStream = new ByteArrayInputStream(bos.toByteArray)
  val workbook = new XSSFWorkbook(xlsByteStream)
}
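A minimal alternative sketch, assuming Apache Commons IO is on the classpath (it is not shown in the original code): IOUtils.toByteArray reads the current zip entry to its end, which collapses the inner copy loop.

import java.io.ByteArrayInputStream
import java.util.zip.{ZipEntry, ZipInputStream}
import org.apache.commons.io.IOUtils
import org.apache.poi.xssf.usermodel.XSSFWorkbook

val zipStream = new ZipInputStream(inputStream)
var zipEntry: ZipEntry = null
while ({ zipEntry = zipStream.getNextEntry; zipEntry != null }) {
  // toByteArray stops at the end of the current entry, not the end of the whole zip
  val workbook = new XSSFWorkbook(new ByteArrayInputStream(IOUtils.toByteArray(zipStream)))
}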
I am trying to write an updateStateByKey that keeps the first value seen for each key. When I try to use it with a case class, I get an error.
import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.apache.spark.streaming._
case class Persons(name : String, school : String)
object StatefulNetworkWordCount {

  def getPerson(str: String): Persons = {
    val splitArray = str.split(",")
    val name = splitArray(0)
    val school = splitArray(1)
    Persons(name, school)
  }

  //Now, newValues is the new set of values
  //runningCount is the existing values for each key
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }

  def updateFunctionFirst(newValues: Seq[String], runningCount: Option[String]): Option[String] = {
    val newWord = if (runningCount.getOrElse("") == "") {
      val str = newValues.head.toString // no existing value yet, take the first new value
      Some(str)
    } else {
      val str = runningCount.getOrElse(newValues.head.toString)
      Some(str)
    }
    newWord
  }

  def updateFunctionFirstPerson(newValues: Seq[Person], state: Option[Person]): Option[Person] = {
    val newWord = if (state.getOrElse("") == "") { // if the running state is empty
      val str = newValues.head.asInstanceOf[Person]
      Some(str)
    } else {
      val str = state.getOrElse(newValues.head.asInstanceOf[Person])
      Some(str)
    }
    newWord
  }

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setMaster("local[8]").setAppName("StatefulNetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(10))
    // Set checkpoint directory
    ssc.checkpoint(".")
    // Create a DStream that will connect to hostname:port, like localhost:9999
    val lines = ssc.socketTextStream("localhost", 9999)
    // Split each line into words
    val words = lines.flatMap(_.split(" "))
    // Count each word in each batch
    val pairs = words.map(word => (word.hashCode, word))
    val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _)
    runningCounts.print()
    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
The line
val runningCounts = pairs.updateStateByKey[Persons](updateFunctionFirstPerson _)
throws an error, but if I use the
val runningCounts = pairs.updateStateByKey[String](updateFunctionFirst _)
to get the first value for a key, it works fine. Can we use a custom class in updateStateByKey? How can I use it?
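A minimal sketch of updateStateByKey with a custom case class as the state type, assuming the Persons case class and socket source from the question. Note that updateFunctionFirstPerson above is typed against Person while the case class is named Persons, and that the update function's first parameter must be a Seq of the pair's value type, so the lines are mapped to Persons first. The keepFirst and personPairs names are illustrative:

// Keep the first Persons value seen for each key.
def keepFirst(newValues: Seq[Persons], state: Option[Persons]): Option[Persons] =
  state.orElse(newValues.headOption)

// inside main(), after `val lines = ssc.socketTextStream("localhost", 9999)`:
val personPairs = lines.map(getPerson).map(p => (p.name.hashCode, p)) // DStream[(Int, Persons)]
val firstPerPerson = personPairs.updateStateByKey[Persons](keepFirst _)
firstPerPerson.print()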
I am a newbie to Spark ML. I am looking for a streaming ML example with raw data as input (I mean raw string-delimited data, not vectorized data).
I tried to look through most of the forums to find a similar example, but I couldn't find any.
So I approached Streaming KMeans on Spark 1.6 the following way (Streaming KMeans still works on vector data rather than DataFrames), but I am not sure if this is the right way.
I convert the input DStream records from Kafka to a DataFrame.
I built a data-prep pipeline to read the columns I need and convert them to vectors.
Since the data is different in every stream batch and the vector length may differ, I use Tokenizer and HashingTF to maintain a constant vector length.
Also, in each stream batch, to identify which row a vector belongs to, I assign a row number and send that to the Streaming KMeans algorithm.
After getting the cluster ID for each row, I join the cluster ID back to the row data to get the final predicted DataFrame.
Now I have doubts about the way I implemented this.
To my understanding, using HashingTF with a fixed vector length (setNumFeatures) will not solve my issue, the reason being that the same vectors may repeat if the number of distinct combinations is smaller than the number of rows in a single batch (hard-coded to 200 right now in the code below).
I also tried StringIndexer with OneHotEncoder to generate the vectors, but I see the vector length differing for each batch.
Since we have to assign random centers for StreamingKMeans ahead of time, I have to know the vector length prior to execution (for my test data and columns I get a vector length of 800, so I hard-coded it for now for testing).
Although training vectors covering all combinations of the data should be sent separately to train the model, right now I am using the same vectors for both training and testing.
Can someone guide me or share a streaming ML example with raw data as input?
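One way to avoid hard-coding the 800: with the example arguments in the code below (stringCols=workclass,age,education_num,hours_per_week, HashingTF numFeatures of 200, no non-string columns), the assembled vector length works out to 4 * 200 = 800, so the dimension passed to setRandomCenters can be derived from the column lists instead. A hypothetical helper sketch:

// Hypothetical helper: derive the assembled feature-vector length instead of hard-coding it.
// Each string column is hashed to `numFeatures` slots; each non-string column adds one slot.
def featureDimension(stringCols: String, nonStringCols: String, numFeatures: Int = 200): Int = {
  val nString = if (stringCols == null || stringCols.isEmpty) 0 else stringCols.split(",").length
  val nNonString = if (nonStringCols == null || nonStringCols.isEmpty) 0 else nonStringCols.split(",").length
  nString * numFeatures + nNonString
}
// featureDimension("workclass,age,education_num,hours_per_week", "") == 4 * 200 == 800
// ... .setRandomCenters(featureDimension(stringCols, nonStringCols), 0)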
import com.common.Configuration._
import com.twitter.bijection.Injection
import com.twitter.bijection.avro.GenericAvroCodecs
import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import scala.util.Try;
object KafkaDataConsumer {
//localOrCluster=l streamBatchSeconds=2 locationToSaveStream=tmp3 isAvroTopic=text hiveTableName= modelName=streamkmeans hiveORFile=file locationToSaveModelData=tmp1 predictedDataLocation=tmp2 labelCol=salary nonStringCols= stringCols=workclass,age,education_num,hours_per_week csvOrHiveForTrain=data/mllib/adult.csv csvOrHiveForTest=data/mllib/adult.csv noOfNodes=10 noOfIter=10 locationToStorePmml=pmmlfolder runMLPipeline=true isOverwriteDataOk=no
// Setting the logs levels
setLogLevels(Level.WARN, Seq("spark", "org", "akka"))
def main(args: Array[String]) {
executeStreamdata(args)
}
def executeStreamdata(args: Array[String]) {
val namedArgs = getNamedArgs(args)
val localOrCluster = namedArgs("localOrCluster")
val streamBatchSeconds = namedArgs("streamBatchSeconds").toInt
val isAvroTopic = namedArgs("isAvroTopic").equalsIgnoreCase("avro")
val kafkaBrokerList = kafkaConfig.getString("kafkaBrokerList")
val kafkaTopicList =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_NAME")
else
kafkaConfig.getString("TOPIC2_NAME")
val SCHEMA =
if (isAvroTopic)
kafkaConfig.getString("TOPIC1_SCHEMA")
else
kafkaConfig.getString("TOPIC2_SCHEMA")
val labelCol = namedArgs("labelCol")
val nonStringCols = namedArgs("nonStringCols")
val stringCols = namedArgs("stringCols")
val noOfNodes = namedArgs("noOfNodes").toInt
val predictedDataLocation = namedArgs("predictedDataLocation")
val sc = getSparkContext(localOrCluster, "kafka stream application")
val ssc = new StreamingContext(sc, Seconds(streamBatchSeconds))
val sqlContext = new SQLContext(sc)
val topicsSet = kafkaTopicList.split(",").toSet
val kafkaParams = Map[String, String]("metadata.broker.list" -> kafkaBrokerList)
//Read Stream records either as AVRO or String
val dStreamRecords =
if (isAvroTopic) {
val parser = new Schema.Parser
val schema = parser.parse(SCHEMA)
val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
KafkaUtils.createDirectStream[String, Array[Byte], StringDecoder, DefaultDecoder](ssc, kafkaParams, topicsSet)
.map(message => recordInjection.invert(message._2).get.toString)
}
else {
KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet).map(_._2)
}
println("Executing Streaming KMeans")
//Pipeline for generating the label, features vector using StringTokenizer and HashFunction based on input label Columns, string and nonString Columns
val pipelineStagesWithAssembler = buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols, true)
val pipeline = new Pipeline().setStages(pipelineStagesWithAssembler)
var vectorsDFSchema:StructType = null
def createDFFromDStream(rdd: RDD[String]): DataFrame = {
if (isAvroTopic)
sqlContext.read.json(rdd)
else
sqlContext.createDataFrame(rdd.map(x => Row.fromSeq(x.split(",").toSeq)), createSchemaFrmStrDelValues(SCHEMA))
}
val vectorDataWithRowNum = dStreamRecords.transform(rdd=>{
//Create dataframe from the input data
val dataframe = createDFFromDStream(rdd)
//Get the pipeline model
val pipelineModel = pipeline.fit(dataframe)
//Get label, features vectors in the DataF
val vectorsDF = pipelineModel.transform(dataframe)
//get the row number for the dataframe
val vectorsDFWithRowNum = dfWithRowIndexUsingRDD(vectorsDF)
vectorsDFSchema = StructType(vectorsDFWithRowNum.schema.filter(!_.dataType.toString.contains("Vector")))
//Get row number, vectors , row data
val reqData = vectorsDFWithRowNum.map(row => {
(row.getAs[Long]("row_num"), row.getAs[org.apache.spark.mllib.linalg.Vector]("features"), row)
})
reqData
})
// get train data(vector) , test data (row number or index, vector), row data (row number or index, row data)
val (dStreamTrainVector, dStreamTestVector, rowData) = (vectorDataWithRowNum.map(_._2), vectorDataWithRowNum.map(data=>(data._1, data._2)), vectorDataWithRowNum.map(data=>(data._1, data._3)))
// Need to Fix the Random Centers hardcoded 800
val model = new StreamingKMeans()
.setK(noOfNodes)
.setDecayFactor(1.0)
.setRandomCenters(800, 0)
// Train the Kmeans model
model.trainOn(dStreamTrainVector)
// Predict vales from the model
val joinData = model.predictOnValues(dStreamTestVector).join(rowData)
joinData.foreachRDD(rdd=> {
val seqData = rdd.map(x=>{
val id = x._1
val clusterID = x._2._1
val rowData = x._2._2.toSeq.filter(!_.isInstanceOf[org.apache.spark.mllib.linalg.SparseVector])
Row.fromSeq(rowData :+ clusterID.toString)
})
val dfPredictions = sqlContext.createDataFrame(seqData, vectorsDFSchema.add("clusterID", StringType))
dfPredictions.printSchema()
dfPredictions.show()
dfPredictions.write.mode(SaveMode.Append).save(predictedDataLocation)
})
ssc.start
ssc.awaitTermination
}
private def buildOneHotPipeLine(colName:String, isStreamJob:Boolean = false):Array[PipelineStage] = {
if(isStreamJob){
val tokenizer = new org.apache.spark.ml.feature.Tokenizer()
.setInputCol(s"${colName}")
.setOutputCol(s"${colName}_token")
val hashingTF = new org.apache.spark.ml.feature.HashingTF()
.setInputCol(s"${colName}_token")
.setOutputCol(s"${colName}_hashFeature").setNumFeatures(200)
Array(tokenizer, hashingTF)
}else {
val stringIndexer = new StringIndexer()
.setInputCol(s"$colName")
.setOutputCol(s"${colName}_index")
val oneHotEncoder = new OneHotEncoder()
.setInputCol(s"${colName}")
.setOutputCol(s"${colName}_onehotindex")
Array(stringIndexer, oneHotEncoder)
}
}
def buildDataPrepPipeLine(sqlContext: SQLContext, lableCol: String, stringCols: String, nonStringCols: String, isStreamJob:Boolean = false):Array[PipelineStage] = {
var pipelineStagesforFeatures : Array[PipelineStage]= null
var assemblerInputCols :Array[String] = null
if(stringCols != null && !stringCols.isEmpty ){
val stringColsArray = stringCols.split(",")
if(pipelineStagesforFeatures != null) {
pipelineStagesforFeatures = pipelineStagesforFeatures ++ stringColsArray.map(columnName => buildOneHotPipeLine(columnName, isStreamJob)).reduce(_ ++ _)
}else{
pipelineStagesforFeatures = stringColsArray.map(columnName => buildOneHotPipeLine(columnName, isStreamJob)).reduce(_++_)
}
if(assemblerInputCols != null) {
assemblerInputCols = assemblerInputCols ++ stringColsArray.map(colName => if(isStreamJob) s"${colName}_hashFeature" else s"${colName}_onehotindex")
}else{
assemblerInputCols = stringColsArray.map(colName => if(isStreamJob) s"${colName}_hashFeature" else s"${colName}_onehotindex")
}
}
if(nonStringCols != null && !nonStringCols.isEmpty ){
val nonStringColsArray = nonStringCols.split(",")
if(assemblerInputCols != null) {
assemblerInputCols = assemblerInputCols ++ nonStringColsArray
}else{
assemblerInputCols = nonStringColsArray
}
}
// Combine all the features and make it a single Feature
val assembler = new VectorAssembler()
.setInputCols(assemblerInputCols)
.setOutputCol("features")
var labelIndexer : StringIndexer= null
if(lableCol != null && !lableCol.isEmpty){
labelIndexer = new StringIndexer()
labelIndexer.setInputCol(lableCol)
labelIndexer.setOutputCol("label")
}
val pipelineStagesWithAssembler =
if(stringCols != null && !stringCols.isEmpty &&
nonStringCols != null && !nonStringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty) {
pipelineStagesforFeatures.toList ::: List(assembler,labelIndexer)
}else if( stringCols != null && !stringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler,labelIndexer)
} else if( nonStringCols != null && !nonStringCols.isEmpty &&
lableCol != null && !lableCol.isEmpty){
List(assembler,labelIndexer)
}else if( stringCols != null && !stringCols.isEmpty &&
nonStringCols != null && !nonStringCols.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler)
}else if( stringCols != null && !stringCols.isEmpty){
pipelineStagesforFeatures.toList ::: List(assembler)
}else{
List(assembler)
}
pipelineStagesWithAssembler.toArray
}
def dfWithRowIndexUsingRDD(df: DataFrame, offset: Int = 1, colName: String = "row_num", inFront: Boolean = true): DataFrame = {
df.sqlContext.createDataFrame(
df.rdd.zipWithIndex.map(ln =>
Row.fromSeq(
(if (inFront) Seq(ln._2 + offset) else Seq())
++ ln._1.toSeq ++
(if (inFront) Seq() else Seq(ln._2 + offset))
)
),
StructType(
(if (inFront) Array(StructField(colName, LongType, false)) else Array[StructField]())
++ df.schema.fields ++
(if (inFront) Array[StructField]() else Array(StructField(colName, LongType, false)))
)
)
}
def getSparkContext(runLocal: String, appName: String) = {
val sc: SparkContext = if (runLocal.equalsIgnoreCase("local") || runLocal.equalsIgnoreCase("l")) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local[1]", appName, sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName(appName)
new SparkContext(sparkConfig)
}
sc.hadoopConfiguration.setBoolean("parquet.enable.summary-metadata", false)
sc
}
def createSchemaFrmStrDelValues(baseSchema: String): StructType = {
return StructType(baseSchema.split(",").map(f => StructField(f, StringType, true)))
}
def getNamedArgs(args: Array[String]): Map[String, String] = {
println("################### Input parameters are ############### " + args.toList)
args.filter(line => line.contains("=")) //take only named arguments
.map(x => {
val key = x.substring(0, x.indexOf("="))
val value = x.substring(x.indexOf("=") + 1)
(key, if (value == null || "".equalsIgnoreCase(value)) null else value)
}).toMap //convert to a map
}
def setLogLevels(level: Level, loggers: Seq[String]): Map[String, Level] = loggers.map(loggerName => {
val logger = Logger.getLogger(loggerName)
val prevLevel = logger.getLevel
logger.setLevel(level)
loggerName -> prevLevel
}).toMap
def pullDataFromCSVFile(sqlContext: SQLContext, isHeaderExist: Boolean, filePath: String, delimiter: String, csvSplit: String): DataFrame = {
var csvDataFrame: DataFrame = null
try {
if (isHeaderExist) {
csvDataFrame = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true")
.option("inferSchema", "true")
.load(filePath)
} else {
if (csvSplit != null) {
val schema = createSchemaFrmStrDelValues(csvSplit)
csvDataFrame = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "false")
.option("delimiter", delimiter)
.option("inferSchema", "false")
.schema(schema)
.load(filePath)
}
}
} catch {
case ex: Exception => {
println("Unable to read the CSV file from the location " + filePath)
ex.printStackTrace()
throw ex
}
}
csvDataFrame
}
def executeBatch(args: Array[String]) {
val namedArgs = getNamedArgs(args)
val runLocal = namedArgs("clusterORLocal")
val sc: SparkContext = getSparkContext(runLocal, "Ml Pipeline")
val csvOrHiveForTrain = namedArgs("csvOrHiveForTrain")
val csvOrHiveForTest = namedArgs("csvOrHiveForTest")
val locationToSaveModelData = namedArgs("locationToSaveModelData")
val labelCol = namedArgs("labelCol")
val featureCols = namedArgs("featureCols").toString.split(",").map(_.toLowerCase)
val noOfNodes = namedArgs("noOfNodes").toInt
val noOfIter = namedArgs("noOfIter").toInt
val sqlContext = new SQLContext(sc);
val trainDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTrain, null, null)
trainDF.printSchema()
val testDF = pullDataFromCSVFile(sqlContext, true, csvOrHiveForTest, null, null)
testDF.printSchema()
val rootLogger = Logger.getRootLogger()
rootLogger.setLevel(Level.ERROR)
var nonStringCols = ""
var stringCols = ""
for (field <- testDF.schema){
val fieldName = field.name.trim
val fieldDataType = field.dataType.typeName.trim
if(featureCols.contains(fieldName.toLowerCase())){
if(fieldDataType.equalsIgnoreCase("integer") || fieldDataType.equalsIgnoreCase("long") ||
fieldDataType.equalsIgnoreCase("DOUBLE") || fieldDataType.equalsIgnoreCase("FLOAT") ||
fieldDataType.equalsIgnoreCase("DECIMAL"))
nonStringCols +=fieldName+","
else if(fieldDataType.equalsIgnoreCase("string"))
stringCols +=fieldName+","
}
}
if(nonStringCols.isEmpty && stringCols.isEmpty){
throw new Exception("Check if Feature columns are empty")
}
val reqModel = Try {
KMeansModel.load(locationToSaveModelData)
}.getOrElse({
val kmeans = new KMeans().setK(noOfNodes).setMaxIter(noOfIter)
kmeans
})
//Build pipeline
val pipeline = new Pipeline().setStages(buildDataPrepPipeLine(sqlContext, labelCol, stringCols, nonStringCols) ++ Array(reqModel))
//Model generated and used to prepare data for model
val pipelineModel = pipeline.fit(trainDF)
//predict using model using the prepared data thru the pipeline
val testPredictions = pipelineModel.transform(testDF)
testPredictions.show()
}
}