object SparkMain extends App {
System.setProperty("spark.cassandra.connection.host", "127.0.0.1")
val conf = new SparkConf().setMaster("local[2]").setAppName("kafkaspark").set("spark.streaming.concurrentJobs","4")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(5))
val sqlContext= new SQLContext(sc)
val host = "localhost:2181"
val topicList = List("test","fb")
topicList.foreach{
topic=> val lines =KafkaUtils.createStream(ssc, host, topic, Map(topic -> 1)).map(_._2);
//configureStream(topic, lines)
lines.foreachRDD(rdd => rdd.map(test(_)).saveToCassandra("test","rawdata",SomeColumns("key")))
}
ssc.addStreamingListener(new StreamingListener {
override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
System.out.println("Batch completed, Total delay :" + batchCompleted.batchInfo.totalDelay.get.toString + " ms")
}
override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
println("inside onReceiverStarted")
}
override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
println("inside onReceiverError")
}
override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = {
println("inside onReceiverStopped")
}
override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = {
println("inside onBatchSubmitted")
}
override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
println("inside onBatchStarted")
}
})
ssc.start()
println("===========================")
ssc.awaitTermination()
}
case class test(key: String)
If I put any one of the topics at a time, then each topic works. But when topic list has more than one topic, after getting the DataStream from kafka topic, it keeps printing "inside onBatchSubmitted".
My Bad . I configured it wrong.
setMaster("local[*]") in place of setMaster("local[2]") works.
change local[2] to local[*] and its working fine.
val conf = new SparkConf().setMaster("local[*]").setAppName("kafkaspark").set("spark.streaming.concurrentJobs","4")
Related
We can use following api to write dataframe into local files.
df.write.parquet(path)
df.write.json(path)
However, Can I write into a parquet and a json in one time without compute the dataframe twice ?
By the way , I dont want to cache the data in memory, because it's too big.
If you don't cache/persist the dataframe, then it'll will need re-computed for each output format.
We can implement an org.apache.spark.sql.execution.datasources.FileFormat to do such thing.
DuplicateOutFormat demo
/**
* Very Dangerous Toy Code. DO NOT USE IN PRODUCTION.
*/
class DuplicateOutFormat
extends FileFormat
with DataSourceRegister
with Serializable {
override def inferSchema(sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = {
throw new UnsupportedOperationException()
}
override def prepareWrite(sparkSession: SparkSession,
job: Job,
options: Map[String, String],
dataSchema: StructType): OutputWriterFactory = {
val format1 = options("format1")
val format2 = options("format2")
val format1Instance = DataSource.lookupDataSource(format1, sparkSession.sessionState.conf)
.newInstance().asInstanceOf[FileFormat]
val format2Instance = DataSource.lookupDataSource(format2, sparkSession.sessionState.conf)
.newInstance().asInstanceOf[FileFormat]
val writerFactory1 = format1Instance.prepareWrite(sparkSession, job, options, dataSchema)
val writerFactory2 = format2Instance.prepareWrite(sparkSession, job, options, dataSchema)
new OutputWriterFactory {
override def getFileExtension(context: TaskAttemptContext): String = ".dup"
override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
val path1 = path.replace(".dup", writerFactory1.getFileExtension(context))
val path2 = path.replace(".dup", writerFactory2.getFileExtension(context))
val writer1 = writerFactory1.newInstance(path1, dataSchema, context)
val writer2 = writerFactory2.newInstance(path2, dataSchema, context)
new OutputWriter {
override def write(row: InternalRow): Unit = {
writer1.write(row)
writer2.write(row)
}
override def close(): Unit = {
writer1.close()
writer2.close()
}
}
}
}
}
override def shortName(): String = "dup"
}
SPI
we should make a SPI file /META-INF/services/org.apache.spark.sql.sources.DataSourceRegister, content:
com.github.sparkdemo.DuplicateOutFormat.
demo usage
class DuplicateOutFormatTest extends FunSuite {
val spark = SparkSession.builder()
.master("local")
.getOrCreate()
val sc = spark.sparkContext
import spark.implicits._
test("testDuplicateWrite") {
val data = Array(
("k1", "fa", "20210901", 16),
("k2", null, "20210902", 15),
("k3", "df", "20210903", 14),
("k4", null, "20210904", 13)
)
val tempDir = System.getProperty("java.io.tmpdir") + "spark-dup-test" + System.nanoTime()
val df = sc.parallelize(data).toDF("k", "col2", "day", "col4")
df.write
.option("format1", "csv")
.option("format2", "orc")
.format("dup").save(tempDir)
df.show(1000, false)
}
}
WARNING
Spark SQL couple some sth in DataFrameWriter#saveToV1Source and other source code, that we can't change. This custom DuplicateOutFormat is just for demo, lacking of test. Full demo in github.
I'm using spark streaming to ingest my company's internal data source. I followed this tutorial to write a receiver: https://spark.apache.org/docs/latest/streaming-custom-receivers.html. But in Spark UI streaming tag, I always see 0 msgs coming in. Also I don't see any errors in driver logs. Really confused what goes wrong. (To connect to the internal data source, need to create a client, then listen() will keep running to get the new msgs) I doubt is it because of the listen mode on the data source?
My Receiver
class MyReceiver(val clientId: String, val token: String, val env: String) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {
def onStart() {
new Thread("My Data Source") { override def run() { receive() } }.start()
}
def onStop() { }
private def receive() {
while(!isStopped()) {
try {
val client = new Client(clientId, token, "STAGE")
client.connect()
client.listen(Client.Topic, new ClientMsgHandler() {
override def process(event: ClientMsg): Unit = {
val msg: String = event.getBody
store(msg)
}
override def onException(event: ClientEvent): Unit = {
}
})
} catch {
case ce: java.net.ConnectException =>
System.out.println("Could not connect")
case t: Throwable =>
System.out.println("Error receiving data")
}
}
}
}
==================================================================
Create Stream
class MyStream(sc: SparkContext, sqlContext: SQLContext, cpDir: String) {
def creatingFunc(): StreamingContext = {
val ssc = new StreamingContext(sc, Seconds(3))
// Set the active SQLContext so that we can access it statically within the foreachRDD
SQLContext.setActiveSession(sqlContext)
ssc.checkpoint(cpDir)
val ClientId = <Myclientid>
val Token = <Mytoken>
val env = "STAGE"
val stream = ssc.receiverStream(new MyReceiver(ClientId, Token, env))
stream.foreachRDD { rdd => println("Here"+rdd.take(10).mkString(", "))
}
ssc
}
}
==================================================================
Start Streaming
val checkpoint_dir = <my_checkpoint_dir>
val MyDataSourceStream = new MyStream(sc, sqlContext, checkpoint_dir)
val ssc = StreamingContext.getActiveOrCreate(checkpoint_dir, MyDataSourceStream.creatingFunc _)
ssc.start()
ssc.awaitTermination()
Updates:
Since it's an internal source, I cannot share Client source code. But I've tested the connection. It works for below code and msg can be printed out correctly. You can think of Client as an external lib which has no connection issues.
val ClientId = <myclientid>
val Token = <mytoken>
val client = new EVClient(ClientId, Token, "STAGE")
client.connect()
client.listen(Client.Topic, new ClientMsgHandler() {
override def onEvent(event: ClientMsg): Unit = {
val res = event.getBody
println(res)
}
override def onException(event: ClientEvent): Unit = {
}
})
I have a dataframe let's say:
val someDF = Seq(
(8, "bat"),
(64, "mouse"),
(-27, "horse")
).toDF("number", "word")
I want to send that dataframe to a kafka topic using avro serialization and using schema registry. I believe I'm almost there, but I can't seem to get past the Task not serializable error. I understand there is a sink for kafka, but it doesn't communicate with the schema registry which is a requirement.
object Holder extends Serializable{
def prop(): java.util.Properties = {
val props = new Properties()
props.put("schema.registry.url", schemaRegistryURL)
props.put("key.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
props.put("value.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
props.put("schema.registry.url", schemaRegistryURL)
props.put("bootstrap.servers", brokers)
props
}
def vProps(props: java.util.Properties): kafka.utils.VerifiableProperties = {
val vProps = new kafka.utils.VerifiableProperties(props)
vProps
}
def messageSchema(vProps: kafka.utils.VerifiableProperties): org.apache.avro.Schema = {
val ser = new KafkaAvroEncoder(vProps)
val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(subjectValueName)
val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
messageSchema
}
def avroRecord(messageSchema: org.apache.avro.Schema): org.apache.avro.generic.GenericData.Record = {
val avroRecord = new GenericData.Record(messageSchema)
avroRecord
}
def ProducerRecord(avroRecord:org.apache.avro.generic.GenericData.Record): org.apache.kafka.clients.producer.ProducerRecord[org.apache.avro.generic.GenericRecord,org.apache.avro.generic.GenericRecord] = {
val record = new ProducerRecord[GenericRecord, GenericRecord](topicWrite, avroRecord)
record
}
def producer(props: java.util.Properties): KafkaProducer[GenericRecord, GenericRecord] = {
val producer = new KafkaProducer[GenericRecord, GenericRecord](props)
producer
}
}
val prod: (String, String) => String = (
number: String,
word: String,
) => {
val prop = Holder.prop()
val vProps = Holder.vProps(prop)
val mSchema = Holder.messageSchema(vProps)
val aRecord = Holder.avroRecord(mSchema)
aRecord.put("number", number)
aRecord.put("word", word)
val record = Holder.ProducerRecord(aRecord)
val producer = Holder.producer(prop)
producer.send(record)
"sent"
}
val prodUDF: org.apache.spark.sql.expressions.UserDefinedFunction =
udf((
Number: String,
word: String,
) => prod(number,word))
val testDF = firstDF.withColumn("sent", prodUDF(col("number"), col("word")))
KafkaProducer is not serializable.
Create the KafkaProducer inside prod() instead of creating it outside.
Hi am having a spark streaming program which is reading the events from eventhub and pushing it topics. for processing each batch it is taking almost 10 times the batch time.
when am trying to implement multithreading am not able to see much difference in the processing time.
Is there any way by which I can increase the performance either by doing parallel processing. or start some 1000 threads at a time and just keep pushing the messages.
class ThreadExample(msg:String) extends Thread{
override def run {
var test = new PushToTopicDriver(msg)
test.push()
// println(msg)
}
}
object HiveEventsDirectStream {
def b2s(a: Array[Byte]): String = new String(a)
def main(args: Array[String]): Unit = {
val eventhubnamespace = "namespace"
val progressdir = "/Event/DirectStream/"
val eventhubname_d = "namespacestream"
val ehParams = Map[String, String](
"eventhubs.policyname" -> "PolicyKeyName",
"eventhubs.policykey" -> "key",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "namespacestream",
"eventhubs.partition.count" -> "30",
"eventhubs.consumergroup" -> "$default",
"eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
"eventhubs.checkpoint.interval" -> "2"
)
println("testing spark")
val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[4]").setAppName("Eventhubs_Test")
conf.registerKryoClasses(Array(classOf[PublishToTopic]))
conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sc= new SparkContext(conf)
val hiveContext = new HiveContext(sc)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val pool:ExecutorService=Executors.newFixedThreadPool(30)
val ssc = new StreamingContext(sc, Seconds(2))
var dataString :RDD[String] =sc.emptyRDD
val stream=EventHubsUtils.createDirectStreams(ssc,eventhubnamespace,progressdir,Map(eventhubname_d -> ehParams))
val kv1 = stream.map(receivedRecord => (new String(receivedRecord.getBody))).persist()
kv1.foreachRDD(rdd_1 => rdd_1.foreachPartition(line => line.foreach(msg => {var t1 = new ThreadExample(msg) t1.start()})))
ssc.start()
ssc.awaitTermination()
}
}
Thanks,
Ankush Reddy.
I write a demo to write data to hbase, but no response, no error, no log.
My hbase is 0.98, hadoop 2.3, spark 1.4.
And I run in yarn-client mode. any idea? thanks.
object SparkConnectHbase2 extends Serializable {
def main(args: Array[String]) {
new SparkConnectHbase2().toHbase();
}
}
class SparkConnectHbase2 extends Serializable {
def toHbase() {
val conf = new SparkConf().setAppName("ljh_ml3");
val sc = new SparkContext(conf)
val tmp = sc.parallelize(Array(601, 701, 801, 901)).foreachPartition({ a =>
val configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.property.clientPort", "2181");
configuration.set("hbase.zookeeper.quorum", “192.168.1.66");
configuration.set("hbase.master", “192.168.1.66:60000");
val table = new HTable(configuration, "ljh_test4");
var put = new Put(Bytes.toBytes(a+""));
put.add(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes(a + "value"));
table.put(put);
table.flushCommits();
})
}
}
thanks.
Write to hbase table
import org.apache.hadoop.hbase.client.{HBaseAdmin, HTable, Put}
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, HColumnDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark._
val hconf = HBaseConfiguration.create()
hconf.set(TableInputFormat.INPUT_TABLE, tablename)
val admin = new HBaseAdmin(hconf)
if(!admin.isTableAvailable(tablename)) {
val tabledesc= new HTableDescriptor(tablename)
tabledesc.addFamily(new HColumnDescriptor("cf1".getBytes()));
admin.createTable(tabledesc)
}
val newtable= new HTable(hconf, tablename);
val put = new Put(new String("row").getBytes());
put .add("cf1".getBytes(), "col1".getBytes(), new String("data").getBytes());
newtable.put(put);
newtable.flushCommits();
val hbaserdd = sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])