I'm using Spark Streaming to ingest one of my company's internal data sources. I followed this tutorial to write a receiver: https://spark.apache.org/docs/latest/streaming-custom-receivers.html. But in the Spark UI's Streaming tab I always see 0 messages coming in, and there are no errors in the driver logs, so I'm confused about what is going wrong. (To connect to the internal data source, I need to create a client; its listen() then keeps running to pick up new messages.) Could the problem be the listen mode of the data source?
My Receiver
class MyReceiver(val clientId: String, val token: String, val env: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    new Thread("My Data Source") {
      override def run() { receive() }
    }.start()
  }

  def onStop() { }

  private def receive() {
    while (!isStopped()) {
      try {
        val client = new Client(clientId, token, "STAGE")
        client.connect()
        client.listen(Client.Topic, new ClientMsgHandler() {
          override def process(event: ClientMsg): Unit = {
            val msg: String = event.getBody
            store(msg)
          }
          override def onException(event: ClientEvent): Unit = {
          }
        })
      } catch {
        case ce: java.net.ConnectException =>
          System.out.println("Could not connect")
        case t: Throwable =>
          System.out.println("Error receiving data")
      }
    }
  }
}
==================================================================
Create Stream
class MyStream(sc: SparkContext, sqlContext: SQLContext, cpDir: String) {

  def creatingFunc(): StreamingContext = {
    val ssc = new StreamingContext(sc, Seconds(3))

    // Set the active SQLContext so that we can access it statically within the foreachRDD
    SQLContext.setActiveSession(sqlContext)

    ssc.checkpoint(cpDir)

    val ClientId = <Myclientid>
    val Token = <Mytoken>
    val env = "STAGE"

    val stream = ssc.receiverStream(new MyReceiver(ClientId, Token, env))
    stream.foreachRDD { rdd =>
      println("Here" + rdd.take(10).mkString(", "))
    }

    ssc
  }
}
==================================================================
Start Streaming
val checkpoint_dir = <my_checkpoint_dir>
val MyDataSourceStream = new MyStream(sc, sqlContext, checkpoint_dir)
val ssc = StreamingContext.getActiveOrCreate(checkpoint_dir, MyDataSourceStream.creatingFunc _)
ssc.start()
ssc.awaitTermination()
Updates:
Since it's an internal source, I cannot share the Client source code. But I've tested the connection: the code below works and the messages are printed out correctly. You can think of Client as an external library that has no connection issues.
val ClientId = <myclientid>
val Token = <mytoken>

val client = new EVClient(ClientId, Token, "STAGE")
client.connect()
client.listen(Client.Topic, new ClientMsgHandler() {
  override def onEvent(event: ClientMsg): Unit = {
    val res = event.getBody
    println(res)
  }
  override def onException(event: ClientEvent): Unit = {
  }
})
Related
We can use the following APIs to write a DataFrame to local files.
df.write.parquet(path)
df.write.json(path)
However, can I write both a Parquet and a JSON output in one pass, without computing the DataFrame twice? By the way, I don't want to cache the data in memory, because it's too big.
If you don't cache/persist the DataFrame, it will need to be recomputed for each output format.
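For reference, a minimal sketch of the persist-then-write approach, assuming disk-only persistence is acceptable since the data is too big for memory (parquetPath and jsonPath are placeholder paths):

import org.apache.spark.storage.StorageLevel

// Persist to disk only, so the data is not held in memory, then write each
// format from the persisted copy instead of recomputing the DataFrame.
df.persist(StorageLevel.DISK_ONLY)
df.write.parquet(parquetPath)
df.write.json(jsonPath)
df.unpersist()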
Alternatively, to write both formats in a single pass, we can implement an org.apache.spark.sql.execution.datasources.FileFormat.
DuplicateOutFormat demo
/**
* Very Dangerous Toy Code. DO NOT USE IN PRODUCTION.
*/
class DuplicateOutFormat
  extends FileFormat
    with DataSourceRegister
    with Serializable {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]): Option[StructType] = {
    throw new UnsupportedOperationException()
  }

  override def prepareWrite(sparkSession: SparkSession,
                            job: Job,
                            options: Map[String, String],
                            dataSchema: StructType): OutputWriterFactory = {
    val format1 = options("format1")
    val format2 = options("format2")

    val format1Instance = DataSource.lookupDataSource(format1, sparkSession.sessionState.conf)
      .newInstance().asInstanceOf[FileFormat]
    val format2Instance = DataSource.lookupDataSource(format2, sparkSession.sessionState.conf)
      .newInstance().asInstanceOf[FileFormat]

    val writerFactory1 = format1Instance.prepareWrite(sparkSession, job, options, dataSchema)
    val writerFactory2 = format2Instance.prepareWrite(sparkSession, job, options, dataSchema)

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String = ".dup"

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
        val path1 = path.replace(".dup", writerFactory1.getFileExtension(context))
        val path2 = path.replace(".dup", writerFactory2.getFileExtension(context))

        val writer1 = writerFactory1.newInstance(path1, dataSchema, context)
        val writer2 = writerFactory2.newInstance(path2, dataSchema, context)

        new OutputWriter {
          override def write(row: InternalRow): Unit = {
            writer1.write(row)
            writer2.write(row)
          }

          override def close(): Unit = {
            writer1.close()
            writer2.close()
          }
        }
      }
    }
  }

  override def shortName(): String = "dup"
}
SPI
We also need to register the format via SPI: create the file /META-INF/services/org.apache.spark.sql.sources.DataSourceRegister with this content:
com.github.sparkdemo.DuplicateOutFormat
demo usage
class DuplicateOutFormatTest extends FunSuite {

  val spark = SparkSession.builder()
    .master("local")
    .getOrCreate()
  val sc = spark.sparkContext

  import spark.implicits._

  test("testDuplicateWrite") {
    val data = Array(
      ("k1", "fa", "20210901", 16),
      ("k2", null, "20210902", 15),
      ("k3", "df", "20210903", 14),
      ("k4", null, "20210904", 13)
    )
    val tempDir = System.getProperty("java.io.tmpdir") + "spark-dup-test" + System.nanoTime()
    val df = sc.parallelize(data).toDF("k", "col2", "day", "col4")
    df.write
      .option("format1", "csv")
      .option("format2", "orc")
      .format("dup").save(tempDir)
    df.show(1000, false)
  }
}
WARNING
Spark SQL couples some things in DataFrameWriter#saveToV1Source and other source code that we can't change, so this custom DuplicateOutFormat is just a demo and lacks tests. The full demo is on GitHub.
I have a DataFrame, let's say:
val someDF = Seq(
(8, "bat"),
(64, "mouse"),
(-27, "horse")
).toDF("number", "word")
I want to send that DataFrame to a Kafka topic using Avro serialization and the Schema Registry. I believe I'm almost there, but I can't seem to get past the Task not serializable error. I understand there is a sink for Kafka, but it doesn't communicate with the Schema Registry, which is a requirement.
object Holder extends Serializable {

  def prop(): java.util.Properties = {
    val props = new Properties()
    props.put("schema.registry.url", schemaRegistryURL)
    props.put("key.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("value.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("schema.registry.url", schemaRegistryURL)
    props.put("bootstrap.servers", brokers)
    props
  }

  def vProps(props: java.util.Properties): kafka.utils.VerifiableProperties = {
    val vProps = new kafka.utils.VerifiableProperties(props)
    vProps
  }

  def messageSchema(vProps: kafka.utils.VerifiableProperties): org.apache.avro.Schema = {
    val ser = new KafkaAvroEncoder(vProps)
    val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(subjectValueName)
    val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
    messageSchema
  }

  def avroRecord(messageSchema: org.apache.avro.Schema): org.apache.avro.generic.GenericData.Record = {
    val avroRecord = new GenericData.Record(messageSchema)
    avroRecord
  }

  def ProducerRecord(avroRecord: org.apache.avro.generic.GenericData.Record): org.apache.kafka.clients.producer.ProducerRecord[org.apache.avro.generic.GenericRecord, org.apache.avro.generic.GenericRecord] = {
    val record = new ProducerRecord[GenericRecord, GenericRecord](topicWrite, avroRecord)
    record
  }

  def producer(props: java.util.Properties): KafkaProducer[GenericRecord, GenericRecord] = {
    val producer = new KafkaProducer[GenericRecord, GenericRecord](props)
    producer
  }
}
val prod: (String, String) => String = (
  number: String,
  word: String
) => {
  val prop = Holder.prop()
  val vProps = Holder.vProps(prop)
  val mSchema = Holder.messageSchema(vProps)
  val aRecord = Holder.avroRecord(mSchema)
  aRecord.put("number", number)
  aRecord.put("word", word)
  val record = Holder.ProducerRecord(aRecord)
  val producer = Holder.producer(prop)
  producer.send(record)
  "sent"
}

val prodUDF: org.apache.spark.sql.expressions.UserDefinedFunction =
  udf((
    number: String,
    word: String
  ) => prod(number, word))

val testDF = firstDF.withColumn("sent", prodUDF(col("number"), col("word")))
KafkaProducer is not serializable.
Create the KafkaProducer inside prod() instead of creating it outside.
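A hedged sketch of that idea, with the small twist of making the producer a lazily initialized per-JVM value so that it is neither captured in the serialized closure nor recreated for every row (ProducerHolder is an illustrative name; Holder and its helpers come from the question):

// Sketch only: the producer is created lazily, once per executor JVM, so no
// KafkaProducer instance ends up inside the closure that Spark serializes.
object ProducerHolder {
  lazy val producer: KafkaProducer[GenericRecord, GenericRecord] =
    Holder.producer(Holder.prop())
}

val prod: (String, String) => String = (number: String, word: String) => {
  val mSchema = Holder.messageSchema(Holder.vProps(Holder.prop()))
  val aRecord = Holder.avroRecord(mSchema)
  aRecord.put("number", number)
  aRecord.put("word", word)
  ProducerHolder.producer.send(Holder.ProducerRecord(aRecord))
  "sent"
}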
object SparkMain extends App {

  System.setProperty("spark.cassandra.connection.host", "127.0.0.1")

  val conf = new SparkConf().setMaster("local[2]").setAppName("kafkaspark").set("spark.streaming.concurrentJobs", "4")
  val sc = new SparkContext(conf)
  val ssc = new StreamingContext(sc, Seconds(5))
  val sqlContext = new SQLContext(sc)
  val host = "localhost:2181"
  val topicList = List("test", "fb")

  topicList.foreach { topic =>
    val lines = KafkaUtils.createStream(ssc, host, topic, Map(topic -> 1)).map(_._2)
    //configureStream(topic, lines)
    lines.foreachRDD(rdd => rdd.map(test(_)).saveToCassandra("test", "rawdata", SomeColumns("key")))
  }

  ssc.addStreamingListener(new StreamingListener {
    override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
      System.out.println("Batch completed, Total delay :" + batchCompleted.batchInfo.totalDelay.get.toString + " ms")
    }
    override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
      println("inside onReceiverStarted")
    }
    override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
      println("inside onReceiverError")
    }
    override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = {
      println("inside onReceiverStopped")
    }
    override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = {
      println("inside onBatchSubmitted")
    }
    override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
      println("inside onBatchStarted")
    }
  })

  ssc.start()
  println("===========================")
  ssc.awaitTermination()
}
case class test(key: String)
If I use only one topic at a time, each topic works on its own. But when the topic list has more than one topic, after getting the DStreams from the Kafka topics it just keeps printing "inside onBatchSubmitted".
My bad, I configured it wrong: setMaster("local[*]") in place of setMaster("local[2]") works.
With local[2] each receiver-based input stream occupies one of the two cores, so with two topics there are no cores left to process the batches; changing local[2] to local[*] makes it work fine.
val conf = new SparkConf().setMaster("local[*]").setAppName("kafkaspark").set("spark.streaming.concurrentJobs","4")
I have written a custom receiver to receive the stream that is generated by one of our applications. The receiver starts the process, gets the stream and then calls store(). However, the receive method gets called multiple times; I have written what I think is a proper loop break condition, but it doesn't work. How do I ensure it only reads once and does not read already processed data?
Here is my custom receiver code:
class MyReceiver() extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    new Thread("Splunk Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
  }

  private def receive() {
    try {
      /* My Code to run a process and get the stream */
      val reader = new ResultsReader(job.getResults()) // ResultsReader is the reader for the application
      var event: String = reader.getNextLine
      while (!isStopped || event != null) {
        store(event)
        event = reader.getNextLine
      }
      reader.close()
    } catch {
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
Where did I go wrong?
Problems
1) The job and stream reading happen every 2 seconds and the same data keeps piling up; for 60 lines of data I sometimes get 1800 or more in total.
Streaming Code:
val conf = new SparkConf
conf.setAppName("str1");
conf.setMaster("local[2]")
conf.set("spark.driver.allowMultipleContexts", "true");
val ssc = new StreamingContext(conf, Minutes(2));
val customReceiverStream = ssc.receiverStream(new MyReceiver)
println(" searching ");
//if(customReceiverStream.count() > 0 ){
customReceiverStream.foreachRDD(x => {println("=====>"+ x.count());x.count()});
//}
ssc.start();
ssc.awaitTermination()
Note: I am trying this in my local cluster, and with master as local[2].
The spark-streaming website at https://spark.apache.org/docs/latest/streaming-programming-guide.html#output-operations-on-dstreams mentions the following code:
dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    // ConnectionPool is a static, lazily initialized pool of connections
    val connection = ConnectionPool.getConnection()
    partitionOfRecords.foreach(record => connection.send(record))
    ConnectionPool.returnConnection(connection) // return to the pool for future reuse
  }
}
I have tried to implement this using org.apache.commons.pool2 but running the application fails with the expected java.io.NotSerializableException:
15/05/26 08:06:21 ERROR OneForOneStrategy: org.apache.commons.pool2.impl.GenericObjectPool
java.io.NotSerializableException: org.apache.commons.pool2.impl.GenericObjectPool
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1184)
...
I am wondering how realistic it is to implement a connection pool that is serializable. Has anyone succeeded in doing this?
Thank you.
To address this "local resource" problem what's needed is a singleton object - i.e. an object that's warranted to be instantiated once and only once in the JVM. Luckily, Scala object provides this functionality out of the box.
The second thing to consider is that this singleton will provide a service to all tasks running on the same JVM where it's hosted, so, it MUST take care of concurrency and resource management.
Let's try to sketch(*) such service:
class ManagedSocket(private val pool: ObjectPool[Socket], val socket: Socket) {
  def release() = pool.returnObject(socket)
}

// singleton object
object SocketPool {

  var hostPortPool: Map[(String, Int), ObjectPool[Socket]] = Map()

  sys.addShutdownHook {
    hostPortPool.values.foreach { pool =>
      // terminate each pool
    }
  }

  // factory method
  def apply(host: String, port: Int): ManagedSocket = {
    val pool = hostPortPool.getOrElse((host, port), {
      val p = ??? // create new pool for (host, port)
      hostPortPool += (host, port) -> p
      p
    })
    new ManagedSocket(pool, pool.borrowObject)
  }
}
Then usage becomes:
val host = ???
val port = ???

stream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    val mSocket = SocketPool(host, port)
    partition.foreach { elem =>
      val os = mSocket.socket.getOutputStream()
      // do stuff with os + elem
    }
    mSocket.release()
  }
}
I'm assuming that the GenericObjectPool used in the question takes care of concurrency. Otherwise, access to each pool instance needs to be guarded with some form of synchronization; one possible sketch of such a guard follows the footnote below.
(*) The code is provided to illustrate the idea of how to design such an object; it needs additional effort to be converted into a working version.
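To give an idea of that extra effort, here is one possible way to guard the pool lookup with synchronization. This is a sketch only, assuming commons-pool2's GenericObjectPool and a SocketFactory (a BasePooledObjectFactory[Socket]) like the one defined later in this thread:

import java.net.Socket
import org.apache.commons.pool2.impl.GenericObjectPool

class ManagedSocket(private val pool: GenericObjectPool[Socket], val socket: Socket) {
  def release(): Unit = pool.returnObject(socket)
}

object SocketPool {

  private var hostPortPool: Map[(String, Int), GenericObjectPool[Socket]] = Map()

  sys.addShutdownHook {
    SocketPool.synchronized {
      hostPortPool.values.foreach(_.close()) // terminate each pool on JVM exit
    }
  }

  def apply(host: String, port: Int): ManagedSocket = {
    // synchronize the lookup so the pool for a given (host, port) is created
    // at most once, even when tasks on the same executor JVM race for it
    val pool = SocketPool.synchronized {
      hostPortPool.getOrElse((host, port), {
        val p = new GenericObjectPool[Socket](new SocketFactory(host, port))
        hostPortPool += (host, port) -> p
        p
      })
    }
    new ManagedSocket(pool, pool.borrowObject())
  }
}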
The answer below is wrong!
I'm leaving it here for reference, but it is wrong for the following reason: socketPool is declared as a lazy val, so it gets instantiated on the first request for access. Since the SocketPool case class is not Serializable, this means it gets instantiated within each partition, which makes the connection pool useless because we want to keep connections across partitions and RDDs. It makes no difference whether this is implemented as a companion object or as a case class. The bottom line is: the connection pool must be Serializable, and Apache Commons Pool is not.
import java.io.PrintStream
import java.net.Socket
import org.apache.commons.pool2.{PooledObject, BasePooledObjectFactory}
import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.spark.streaming.dstream.DStream
/**
 * Publish a Spark stream to a socket.
 */
class PooledSocketStreamPublisher[T](host: String, port: Int)
  extends Serializable {

  lazy val socketPool = SocketPool(host, port)

  /**
   * Publish the stream to a socket.
   */
  def publishStream(stream: DStream[T], callback: (T) => String) = {
    stream.foreachRDD { rdd =>
      rdd.foreachPartition { partition =>

        val socket = socketPool.getSocket
        val out = new PrintStream(socket.getOutputStream)

        partition.foreach { event =>
          val text: String = callback(event)
          out.println(text)
          out.flush()
        }

        out.close()
        socketPool.returnSocket(socket)
      }
    }
  }
}

class SocketFactory(host: String, port: Int) extends BasePooledObjectFactory[Socket] {

  def create(): Socket = {
    new Socket(host, port)
  }

  def wrap(socket: Socket): PooledObject[Socket] = {
    new DefaultPooledObject[Socket](socket)
  }
}

case class SocketPool(host: String, port: Int) {

  val socketPool = new GenericObjectPool[Socket](new SocketFactory(host, port))

  def getSocket: Socket = {
    socketPool.borrowObject
  }

  def returnSocket(socket: Socket) = {
    socketPool.returnObject(socket)
  }
}
which you can invoke as follows:
val socketStreamPublisher = new PooledSocketStreamPublisher[MyEvent](host = "10.10.30.101", port = 29009)
socketStreamPublisher.publishStream(myEventStream, (e: MyEvent) => Json.stringify(Json.toJson(e)))