I want to read data from a confluent cloud topic and then write in another topic.
At localhost, I haven't had any major problems. But the schema registry of confluent cloud requires to pass some authentication data that I don't know how to enter them:
basic.auth.credentials.source=USER_INFO
schema.registry.basic.auth.user.info=:
schema.registry.url=https://xxxxxxxxxx.confluent.cloudBlockquote
Below is the current code:
import com.databricks.spark.avro.SchemaConverters
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.sql.SparkSession
object AvroConsumer {
private val topic = "transactions"
private val kafkaUrl = "http://localhost:9092"
private val schemaRegistryUrl = "http://localhost:8081"
private val schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
private val avroSchema = schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
private var sparkSchema = SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.appName("ConfluentConsumer")
.master("local[*]")
.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
spark.udf.register("deserialize", (bytes: Array[Byte]) =>
DeserializerWrapper.deserializer.deserialize(bytes)
)
val kafkaDataFrame = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", kafkaUrl)
.option("subscribe", topic)
.load()
val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")
import org.apache.spark.sql.functions._
val formattedDataFrame = valueDataFrame.select(
from_json(col("message"), sparkSchema.dataType).alias("parsed_value"))
.select("parsed_value.*")
formattedDataFrame
.writeStream
.format("console")
.option("truncate", false)
.start()
.awaitTermination()
}
object DeserializerWrapper {
val deserializer = kafkaAvroDeserializer
}
class AvroDeserializer extends AbstractKafkaAvroDeserializer {
def this(client: SchemaRegistryClient) {
this()
this.schemaRegistry = client
}
override def deserialize(bytes: Array[Byte]): String = {
val genericRecord = super.deserialize(bytes).asInstanceOf[GenericRecord]
genericRecord.toString
}
}
}
I think I have to pass this authentication data to CachedSchemaRegistryClient but I'm not sure if so and how.
I've finally been able to pass the properties.
I leave the lines that gave the solution.
val restService = new RestService(schemaRegistryURL)
val props = Map(
"basic.auth.credentials.source" -> "USER_INFO",
"schema.registry.basic.auth.user.info" -> "secret:secret"
).asJava
var schemaRegistryClient = new CachedSchemaRegistryClient(restService, 100, props)
Related
We can use following api to write dataframe into local files.
df.write.parquet(path)
df.write.json(path)
However, Can I write into a parquet and a json in one time without compute the dataframe twice ?
By the way , I dont want to cache the data in memory, because it's too big.
If you don't cache/persist the dataframe, then it'll will need re-computed for each output format.
We can implement an org.apache.spark.sql.execution.datasources.FileFormat to do such thing.
DuplicateOutFormat demo
/**
* Very Dangerous Toy Code. DO NOT USE IN PRODUCTION.
*/
class DuplicateOutFormat
extends FileFormat
with DataSourceRegister
with Serializable {
override def inferSchema(sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = {
throw new UnsupportedOperationException()
}
override def prepareWrite(sparkSession: SparkSession,
job: Job,
options: Map[String, String],
dataSchema: StructType): OutputWriterFactory = {
val format1 = options("format1")
val format2 = options("format2")
val format1Instance = DataSource.lookupDataSource(format1, sparkSession.sessionState.conf)
.newInstance().asInstanceOf[FileFormat]
val format2Instance = DataSource.lookupDataSource(format2, sparkSession.sessionState.conf)
.newInstance().asInstanceOf[FileFormat]
val writerFactory1 = format1Instance.prepareWrite(sparkSession, job, options, dataSchema)
val writerFactory2 = format2Instance.prepareWrite(sparkSession, job, options, dataSchema)
new OutputWriterFactory {
override def getFileExtension(context: TaskAttemptContext): String = ".dup"
override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
val path1 = path.replace(".dup", writerFactory1.getFileExtension(context))
val path2 = path.replace(".dup", writerFactory2.getFileExtension(context))
val writer1 = writerFactory1.newInstance(path1, dataSchema, context)
val writer2 = writerFactory2.newInstance(path2, dataSchema, context)
new OutputWriter {
override def write(row: InternalRow): Unit = {
writer1.write(row)
writer2.write(row)
}
override def close(): Unit = {
writer1.close()
writer2.close()
}
}
}
}
}
override def shortName(): String = "dup"
}
SPI
we should make a SPI file /META-INF/services/org.apache.spark.sql.sources.DataSourceRegister, content:
com.github.sparkdemo.DuplicateOutFormat.
demo usage
class DuplicateOutFormatTest extends FunSuite {
val spark = SparkSession.builder()
.master("local")
.getOrCreate()
val sc = spark.sparkContext
import spark.implicits._
test("testDuplicateWrite") {
val data = Array(
("k1", "fa", "20210901", 16),
("k2", null, "20210902", 15),
("k3", "df", "20210903", 14),
("k4", null, "20210904", 13)
)
val tempDir = System.getProperty("java.io.tmpdir") + "spark-dup-test" + System.nanoTime()
val df = sc.parallelize(data).toDF("k", "col2", "day", "col4")
df.write
.option("format1", "csv")
.option("format2", "orc")
.format("dup").save(tempDir)
df.show(1000, false)
}
}
WARNING
Spark SQL couple some sth in DataFrameWriter#saveToV1Source and other source code, that we can't change. This custom DuplicateOutFormat is just for demo, lacking of test. Full demo in github.
I am trying to stream twitter data using Apache Spark in Intellij however when i use the function coalesce , it says that it cannot resolve symbol coalesce. Here is my main code:
val spark = SparkSession.builder().appName("twitterStream").master("local[*]").getOrCreate()
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val streamContext = new StreamingContext(sc, Seconds(5))
val filters = Array("Singapore")
val filtered = TwitterUtils.createStream(streamContext, None, filters)
val englishTweets = filtered.filter(_.getLang() == "en")
//englishTweets.print()
englishTweets.foreachRDD{rdd =>
val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
import spark.implicits._
val tweets = rdd.map( field =>
(
field.getId,
field.getUser.getScreenName,
field.getCreatedAt.toInstant.toString,
field.getText.toLowerCase.split(" ").filter(_.matches("^[a-zA-Z0-9 ]+$")).fold("")((a, b) => a + " " + b).trim,
sentiment(field.getText)
)
)
val tweetsdf = tweets.toDF("userID", "user", "createdAt", "text", "sentimentType")
tweetsdf.printSchema()
tweetsdf.show(false)
}.coalesce(1).write.csv("hdfs://localhost:9000/usr/sparkApp/test/testing.csv")
I have tried with my own dataset, and I have read a dataset and while writing I have applied coalesce function and it is giving results, please refer to this it may help you.
import org.apache.spark.sql.SparkSession
import com.spark.Rdd.DriverProgram
import org.apache.log4j.{ Logger, Level }
import org.apache.spark.sql.SaveMode
import java.sql.Date
object JsonDataDF {
System.setProperty("hadoop.home.dir", "C:\\hadoop");
System.setProperty("hadoop.home.dir", "C:\\hadoop"); // This is the system property which is useful to find the winutils.exe
Logger.getLogger("org").setLevel(Level.WARN) // This will remove Logs
case class AOK(appDate:Date, arr:String, base:String, Comments:String)
val dp = new DriverProgram
val spark = dp.getSparkSession()
def main(args : Array[String]): Unit = {
import spark.implicits._
val jsonDf = spark.read.option("multiline", "true").json("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JSONdata.txt").as[AOK]
jsonDf.coalesce(1) // Refer Here
.write
.mode(SaveMode.Overwrite)
.option("header", "true")
.format("csv")
.save("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JsonToCsv")
}
}
I am trying to send the records from kinesis streams to AWS SNS topic. but nothing seems to happen and I do not receive the messages on the topic. Below is my code. Please advise what I am doing incorrect here. I have tried both foreach and foreachBatch. Pleae refer to the commented section in the code for foreachBatch. To test out I have even tried hardcoding a string to see if foreach works but no luck. to Help please!
//Stream reader (Consumer)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import com.amazonaws.services.kinesis.model.PutRecordRequest
import com.amazonaws.services.kinesis.AmazonKinesisClientBuilder
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import java.nio.ByteBuffer
import scala.util.Random
import java.util.Base64
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.from_json
import com.amazonaws.services.sns.{AmazonSNS, AmazonSNSClientBuilder}
import com.amazonaws.services._
import org.apache.spark.sql._
import com.amazonaws.services.sns.model._
val dataSchema = new StructType()
.add("serialnumber", StringType)
.add("purposeId", StringType)
.add("action", StringType)
.add("locale", StringType)
.add("datetime", TimestampType)
.add("ingestiontime", LongType)
val kinesisStreamName = "mystream"
val kinesisRegion = "us-west-2"
val kinesisDF = spark.readStream
.format("kinesis")
.option("streamName", kinesisStreamName)
.option("region", kinesisRegion)
.option("initialPosition", "latest") //"TRIM_HORIZON")
.load()
val recordsStreamDf = kinesisDF
.selectExpr("cast (data as STRING) jsonData")
.select(from_json($"jsonData", dataSchema)/*.cast("struct<installmentNo: integer, loanId: integer>")*/.as("consents"))
.select("consents.*")
display(recordsStreamDf)
val query =
recordsStreamDf
.writeStream.foreach(
new ForeachWriter[Row] {
def open(partitionId: Long, version: Long): Boolean = {
true
}
def process(record: Row): Unit = {
SNSPublisher("test").publishMessage()
}
def close(errorOrNull: Throwable): Unit = {
true
}
}
)
/*.foreachBatch{ (batchDF: DataFrame, batchId: Long) =>
val data = batchDF.withColumn("jsonData", to_json(struct($"serialnumber", $"purposeId", $"action", $"locale", $"datetime", $"ingestiontime"))).select($"jsonData").collect
for (i <- data) {
try {
SNSPublisher(i.getString(0)).publishMessage()
println("Metrics posted to SNS")
} catch {
case e: Exception =>
println("Exception posting message to SNS: " + e)
throw e
}
}
batchDF.foreachPartition {
partitionData =>
partitionData.foreach(row => {
SNSPublisher(row.getString(0)).publishMessage()
})
}
}*/
.outputMode("append")
.format("json")
.queryName("count")
.option("path", "s3://path/dev/user_personal/abhishek.ghosh/") // counts = name of the in-memory table
.option("checkpointLocation", "s3://path/dev/user_personal/abhishek.ghosh/checkpointLocation/")
//.outputMode("complete") // complete = all the counts should be in the table
.start()
//SNS Publisher class for notification of metrics
class SNSPublisher(message: String) {
implicit val sns: SNSPublisher = this
implicit val snsClient: AmazonSNS = AmazonSNSClientBuilder.standard.build
val this.message=message
var msgId=""
def publishMessage() {
msgId = snsClient.publish(new PublishRequest().withTopicArn("my_arn").withMessage(message)).getMessageId
}
}
object SNSPublisher {
def apply(message: String): SNSPublisher = {
new SNSPublisher(message)
}
} ```
just learn spark for a while, i found the api: saveAsNewAPIHadoopDataset when i use hbase, code like below, as far as know,this code can insert one row at a time , how to change it to batch put? i am a rookie ..please help...tks
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkContext, SparkConf}
/**
*
*/
object HbaseTest2 {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
val sc = new SparkContext(sparkConf)
val tablename = "account"
sc.hadoopConfiguration.set("hbase.zookeeper.quorum","slave1,slave2,slave3")
sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", "2181")
sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
val job = Job.getInstance(sc.hadoopConfiguration)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Result])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val indataRDD = sc.makeRDD(Array("1,jack,15","2,Lily,16","3,mike,16"))
val rdd = indataRDD.map(_.split(',')).map{arr=>{
val put = new Put(Bytes.toBytes(arr(0)))
put.addColumn(Bytes.toBytes("cf"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))
put.addColumn(Bytes.toBytes("cf"),Bytes.toBytes("age"),Bytes.toBytes(arr(2).toInt))
(new ImmutableBytesWritable, put)
}}
rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
sc.stop()
}
}
Actually you don't need to worry about this - under the hood, put(Put) and put(List<Put>) are identical. They both buffer messages and flush them in batches. There should be no noticeable performance difference.
I'm afraid the other answer is misguided.
saveAsNewAPIHadoopDataset performs single put.
To perform bulk put to hbase table, you can use hbase-spark connector.
The connector executes bulkPutFunc2 within mapPartition() so is efficient.
Your source code will change like below -
object HBaseTest {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("HBaseTest").setMaster("local")
val sc = new SparkContext(sparkConf)
val tablename = "account"
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", "slave1,slave2,slave3")
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
hbaseConf.set("zookeeper.znode.parent", "/hbase")
val hbaseContext = new HBaseContext(sc, hbaseConf)
val indataRDD = sc.makeRDD(Array("1,jack,15", "2,Lily,16", "3,mike,16"))
hbaseContext.bulkPut(indataRDD, TableName.valueOf(tablename), bulkPutFunc2)
sc.stop()
}
def bulkPutFunc2(arrayRec : String): Put = {
val rec = arrayRec.split(",")
val put = new Put(Bytes.toBytes(rec(0).toInt))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("name"), Bytes.toBytes(rec(1)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("age"), Bytes.toBytes(rec(2).toInt))
put
}
}
pom.xml would have following entry -
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-spark</artifactId>
<version>1.2.0-cdh5.12.1</version>
<dependency>
I write a demo to write data to hbase, but no response, no error, no log.
My hbase is 0.98, hadoop 2.3, spark 1.4.
And I run in yarn-client mode. any idea? thanks.
object SparkConnectHbase2 extends Serializable {
def main(args: Array[String]) {
new SparkConnectHbase2().toHbase();
}
}
class SparkConnectHbase2 extends Serializable {
def toHbase() {
val conf = new SparkConf().setAppName("ljh_ml3");
val sc = new SparkContext(conf)
val tmp = sc.parallelize(Array(601, 701, 801, 901)).foreachPartition({ a =>
val configuration = HBaseConfiguration.create();
configuration.set("hbase.zookeeper.property.clientPort", "2181");
configuration.set("hbase.zookeeper.quorum", “192.168.1.66");
configuration.set("hbase.master", “192.168.1.66:60000");
val table = new HTable(configuration, "ljh_test4");
var put = new Put(Bytes.toBytes(a+""));
put.add(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes(a + "value"));
table.put(put);
table.flushCommits();
})
}
}
thanks.
Write to hbase table
import org.apache.hadoop.hbase.client.{HBaseAdmin, HTable, Put}
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, HColumnDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark._
val hconf = HBaseConfiguration.create()
hconf.set(TableInputFormat.INPUT_TABLE, tablename)
val admin = new HBaseAdmin(hconf)
if(!admin.isTableAvailable(tablename)) {
val tabledesc= new HTableDescriptor(tablename)
tabledesc.addFamily(new HColumnDescriptor("cf1".getBytes()));
admin.createTable(tabledesc)
}
val newtable= new HTable(hconf, tablename);
val put = new Put(new String("row").getBytes());
put .add("cf1".getBytes(), "col1".getBytes(), new String("data").getBytes());
newtable.put(put);
newtable.flushCommits();
val hbaserdd = sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])