Getting an error while connecting to Kerberos-secured Phoenix from Spark - apache-spark

I am getting the following error while connecting to a Kerberos-secured Phoenix environment through Spark Java code.
SASL authentication failed. The most likely cause is missing or invalid credentials. Consider 'kinit'. javax.security.sasl.SaslException: GSS initiate failed
Caused by: org.ietf.jgss.GSSException: No valid credentials provided (Mechanism level: Failed to find any Kerberos tgt)
at sun.security.jgss.krb5.Krb5InitCredential.getInstance(Krb5InitCredential.java:147)
My Spark code is as follows:
SparkConf sparkConf = new SparkConf()
        .setMaster("local[4]")
        .setAppName("phoenix-test")
        .set("hadoop.security.authentication", "kerberos")
        .set("hbase.security.authentication", "kerberos")
        .set("hadoop.rpc.protection", "privacy")
        .set("spark.yarn.principal", "infinityadm/lswpbacrapn3d.nam.nsroot.net#NAMUXDEV.DYN.NSROOT.NET")
        .set("spark.yarn.keytab", "C://files//infinityadm_lswpbacrapn3d.nam.nsroot.net#NAMUXDEV.DYN.NSROOT.NET.keytab");

SparkContext sc = new SparkContext(sparkConf);
JavaSparkContext jsc = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
SQLContext sqlContext = new SQLContext(jsc);

// Read a Phoenix table through the phoenix-spark connector
DataFrame fromPhx = sqlContext
        .read()
        .format("org.apache.phoenix.spark")
        .option("table", "ODS.TRF_DEF_PRM")
        .option("zkURL", "url:/hbase-dev")
        /*.option("zkURL", "lswpbacrapn1d,lswpbacrapn2d,lswpbacrapn3d:2181:/hbase-dev")*/
        .load();
My VM options in IntelliJ are as follows:
-DCPB_LOG_PATH=C:/Users/PB42185/Logs
-DSERVICE_NAME=infinity-utils
-Djava.security.krb5.conf=C:/Users/2185/krb5.conf
-Djava.security.krb5.ini=C:/Users/2185/krb5.ini
-Djava.security.krb5.realm=#APACUXUAT.DYN.NSROOT.NET
-Dzookeeper.sasl.client=false
-Dzookeeper.sasl.client.username=adm
-Dsun.security.krb5.debug=true
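
The "Failed to find any Kerberos tgt" part of the trace means that no ticket is available to the JVM at the moment the HBase/Phoenix connection is opened; spark.yarn.principal and spark.yarn.keytab are generally only honoured when submitting to YARN, not with a local[4] master. One thing worth trying is an explicit keytab login through Hadoop's UserGroupInformation before the SparkContext and the Phoenix read are created. A rough sketch (shown in Scala; the principal and keytab path are placeholders, substitute your own values):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.UserGroupInformation

// Placeholder principal/keytab; replace with the real values.
val principal = "infinityadm/host@REALM"
val keytab    = "C://files//infinityadm.keytab"

// Tell Hadoop's security layer to use Kerberos, then obtain a TGT from the
// keytab so the HBase/Phoenix client in this JVM can authenticate via SASL.
val hadoopConf = new Configuration()
hadoopConf.set("hadoop.security.authentication", "kerberos")
UserGroupInformation.setConfiguration(hadoopConf)
UserGroupInformation.loginUserFromKeytab(principal, keytab)

With a TGT in place, the same phoenix-spark read shown above can be attempted again; -Dsun.security.krb5.debug=true will then show whether the keytab login itself succeeds.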

Related

NoSuchMethodError trying to ingest HDFS data into Elasticsearch

I'm using Spark 3.1.2, Scala 2.12, Hadoop 3.1.1.3.1.2-50, Elasticsearch 7.10.1 (due to license issues), and CentOS 7, and I am trying to ingest JSON data in gzip files located on HDFS into Elasticsearch using Spark Structured Streaming. I get the following error:
Logical Plan:
FileStreamSource[hdfs://pct/user/papago-mlops-datalake/raw/mt-log/engine=n2mt/year=2022/date=0430/hour=00]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:356)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:244)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(Lorg/apache/spark/sql/SparkSession;Lorg/apache/spark/sql/execution/QueryExecution;Lscala/Function0;)Ljava/lang/Object;
at org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink.addBatch(EsSparkSqlStreamingSink.scala:62)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:226)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:194)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:188)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:334)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:317)
... 1 more
ApplicationMaster host: ac3m8x2183.bdp.bdata.ai
ApplicationMaster RPC port: 39673
queue: batch
start time: 1654588583366
final status: FAILED
tracking URL: https://gemini-rm2.bdp.bdata.ai:9090/proxy/application_1654575947385_29572/
user: papago-mlops-datalake
Exception in thread "main" org.apache.spark.SparkException: Application application_1654575947385_29572 finished with failed status
at org.apache.spark.deploy.yarn.Client.run(Client.scala:1269)
at org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1627)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:904)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I am using the following libraries:
implementation("org.elasticsearch:elasticsearch-hadoop:8.2.2")
implementation("com.typesafe:config:1.4.2")
implementation("org.apache.spark:spark-sql_2.12:3.1.2")
testImplementation("org.scalatest:scalatest_2.12:3.2.12")
testRuntimeOnly("com.vladsch.flexmark:flexmark-all:0.61.0")
compileOnly("org.apache.spark:spark-sql_2.12:3.1.2")
compileOnly("org.apache.spark:spark-core_2.12:3.1.2")
compileOnly("org.apache.spark:spark-launcher_2.12:3.1.2")
compileOnly("org.apache.spark:spark-streaming_2.12:3.1.2")
compileOnly("org.elasticsearch:elasticsearch-spark-30_2.12:8.2.2")
I tried using ES-Hadoop version 7.10.1, but ES-Spark only supports Spark 3.0 from 7.12.0 onward, and I still get the same error.
My code is pretty simple:
def main(args: Array[String]): Unit = {
  // Set the log level to only print errors
  Logger.getLogger("org").setLevel(Level.ERROR)

  val spark = SparkSession
    .builder()
    .config(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, elasticsearchUser)
    .config(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, elasticsearchPass)
    .config(ConfigurationOptions.ES_NODES, elasticsearchHost)
    .config(ConfigurationOptions.ES_PORT, elasticsearchPort)
    .appName(appName)
    .master(master)
    .getOrCreate()

  val streamingDF: DataFrame = spark.readStream
    .schema(jsonSchema)
    .format("org.apache.spark.sql.execution.datasources.json.JsonFileFormat")
    .load(pathToJSONResource)

  streamingDF.writeStream
    .outputMode(outputMode)
    .format(destination)
    .option("checkpointLocation", checkpointLocation)
    .start(indexAndDocType)
    .awaitTermination()

  // Stop the session
  spark.stop()
}
}
If I can't use the ES-Hadoop libraries, is there another way I can go about ingesting JSON into ES from HDFS?
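
A NoSuchMethodError on SQLExecution.withNewExecutionId usually means the connector class that was actually loaded at runtime was compiled against a different Spark version than the 3.1.2 on the cluster; the fat org.elasticsearch:elasticsearch-hadoop artifact bundles its own Spark integration and can shadow elasticsearch-spark-30_2.12, so depending on the elasticsearch-spark-30_2.12 artifact alone (as implementation, matching your Spark and Scala versions) is worth trying first. Independently of that, the streaming sink can be bypassed by writing each micro-batch through the batch saveToEs() API from foreachBatch. A rough sketch, reusing streamingDF, outputMode and checkpointLocation from the code above (esResource is a placeholder for the target index):

import org.apache.spark.sql.DataFrame
import org.elasticsearch.spark.sql.EsSparkSQL

val esResource = "my-index"  // placeholder for whatever indexAndDocType held

// Explicitly typed so the Scala overload of foreachBatch is chosen.
val writeBatch: (DataFrame, Long) => Unit =
  (batchDF, _) => EsSparkSQL.saveToEs(batchDF, esResource)

streamingDF.writeStream
  .outputMode(outputMode)
  .option("checkpointLocation", checkpointLocation)
  // Each micro-batch goes through the regular batch writer instead of
  // EsSparkSqlStreamingSink, the class throwing the NoSuchMethodError.
  .foreachBatch(writeBatch)
  .start()
  .awaitTermination()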

Configure PySpark AWS credentials within a Docker container

I'm using Docker to develop local AWS Glue jobs with PySpark. The song_data.py file contains the AWS Glue job. I configured the Spark session with my AWS credentials, although the errors below suggest otherwise. Within the file, I set up four different try statements using GlueContext methods to create a dynamic frame. Here's the Glue job file (song_data.py):
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from configparser import ConfigParser
from pyspark import SparkConf

config = ConfigParser()
config.read_file(open('/app/config/aws.cfg'))

conf = (
    SparkConf()
    .set('spark.hadoop.fs.s3a.access.key', config.get('AWS', 'KEY'))
    .set('spark.hadoop.fs.s3a.secret.key', config.get('AWS', 'SECRET'))
    .set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)
glueContext = GlueContext(spark)

conf_dict = spark.sparkContext.getConf().getAll()
print(conf_dict)

try:
    print('Attempt 1: spark.read.json')
    url = 's3a://sparkify-dend-analytics/song_data/A/A/A/TRAAAAW128F429D538.json'
    spark.read.json(url).show(1)
except Exception as e:
    print(e)

try:
    print('Attempt 2: create_dynamic_frame.from_options')
    song_df = glueContext.create_dynamic_frame.from_options(
        connection_type='s3',
        connection_options={"paths": ["s3a://sparkify-dend-analytics/song_data/"]},
        format='json')
    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)

try:
    print('Attempt 3: create_dynamic_frame.from_catalog')
    song_df = glueContext.create_dynamic_frame.from_catalog(
        database='sparkify',
        table_name='song_data')
    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)

try:
    print('Attempt 4: create_dynamic_frame_from_catalog')
    song_df = glueContext.create_dynamic_frame_from_catalog(
        database='sparkify',
        table_name='song_data')
    print('Count: ', song_df.count())
    print('Schema: ')
    song_df.printSchema()
except Exception as e:
    print(e)
The command I use to run the Glue job is: gluesparksubmit glue_etl_scripts/song_data.py --JOB-NAME test. Here are the short versions of the error outputs for each try statement:
Attempt 1: spark.read.json()
WARN FileStreamSink: Error while looking for metadata directory.
An error occurred while calling o87.json.
: org.apache.hadoop.fs.s3a.AWSClientIOException: doesBucketExist on sparkify-dend-analytics:
com.amazonaws.AmazonClientException: No AWS Credentials provided by
DefaultAWSCredentialsProviderChain : com.amazonaws.SdkClientException: Unable to load AWS
credentials from any provider in the chain: [EnvironmentVariableCredentialsProvider: Unable to
load AWS credentials from environment variables (AWS_ACCESS_KEY_ID (or AWS_ACCESS_KEY) and
AWS_SECRET_KEY (or AWS_SECRET_ACCESS_KEY)), SystemPropertiesCredentialsProvider: Unable to load
AWS credentials from Java system properties (aws.accessKeyId and aws.secretKey),
WebIdentityTokenCredentialsProvider: You must specify a value for roleArn and roleSessionName,
com.amazonaws.auth.profile.ProfileCredentialsProvider@401a5902: profile file cannot be null,
com.amazonaws.auth.EC2ContainerCredentialsProviderWrapper@2b6e2cf9: Failed to connect to service
endpoint: ]: No AWS Credentials provided by DefaultAWSCredentialsProviderChain :
com.amazonaws.SdkClientException: Unable to load AWS credentials from any provider in the chain:
[EnvironmentVariableCredentialsProvider: Unable to load AWS credentials from environment
variables (AWS_ACCESS_KEY_ID (or AWS_ACCESS_KEY) and AWS_SECRET_KEY (or AWS_SECRET_ACCESS_KEY)),
SystemPropertiesCredentialsProvider: Unable to load AWS credentials from Java system properties
(aws.accessKeyId and aws.secretKey), WebIdentityTokenCredentialsProvider: You must specify a
value for roleArn and roleSessionName,
com.amazonaws.auth.profile.ProfileCredentialsProvider@401a5902: profile file cannot be null,
com.amazonaws.auth.EC2ContainerCredentialsProviderWrapper@2b6e2cf9: Failed to connect to service
endpoint: ]
Attempt 2: create_dynamic_frame.from_options()
WARN InstanceMetadataServiceResourceFetcher: Fail to retrieve token
com.amazonaws.SdkClientException: Failed to connect to service endpoint:
....
Caused by: java.net.ConnectException: Connection refused (Connection refused)
....
An error occurred while calling o125.getDynamicFrame.
: org.apache.hadoop.fs.s3a.AWSClientIOException: (same AWSClientIOException as above)
.....
Caused by: com.amazonaws.SdkClientException: Unable to load AWS credentials from any provider in the chain:
Attempt 3: create_dynamic_frame.from_catalog()
WARN InstanceMetadataServiceResourceFetcher: Fail to retrieve token
com.amazonaws.SdkClientException: Failed to connect to service endpoint:
.....
Caused by: java.net.ConnectException: Connection refused (Connection refused)
Attempt 4: create_dynamic_frame_from_catalog()
Same as attempt 3
When I printed out the configuration dict for the Spark session, the AWS access and secret key were valid.
Here's the Spark configuration dict printed from running spark.sparkContext.getConf().getAll():
[('spark.app.name', 'song_data.py'), ('spark.driver.host', '73d3647fdf5b'),
('spark.hadoop.fs.s3a.secret.key', 'xxxxxxx'), ('spark.submit.pyFiles', '/glue/aws-glue-
libs/PyGlue.zip'), ('spark.executor.id', 'driver'), ('spark.driver.extraClassPath', '/glue/aws-
glue-libs/jarsv1/*'), ('spark.app.id', 'local-1593063861647'), ('spark.driver.port', '40655'),
('spark.executor.extraClassPath', '/glue/aws-glue-libs/jarsv1/*'), ('spark.rdd.compress',
'True'), ('spark.hadoop.fs.s3a.access.key', 'xxxxxxx'), ('spark.files', 'file:///glue/aws-glue-
libs/PyGlue.zip'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'),
('spark.submit.deployMode', 'client'), ('fs.s3.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')]
Let me know if the Dockerfile or any other code is needed.
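
The stack traces show S3A falling back to DefaultAWSCredentialsProviderChain, which suggests the fs.s3a.* keys never reached the Hadoop configuration that the S3A filesystem actually consults. Two things that are often worth trying: export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY inside the container (the default chain reads those directly), or set the keys on the live Hadoop configuration after the session is created. A rough sketch of the latter, shown in Scala for brevity (from PySpark the same keys can be set through spark.sparkContext._jsc.hadoopConfiguration()); the credential values are placeholders:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
val hadoopConf = spark.sparkContext.hadoopConfiguration

// Placeholder credentials; in this setup they would come from aws.cfg.
hadoopConf.set("fs.s3a.access.key", "AKIA...")
hadoopConf.set("fs.s3a.secret.key", "REPLACE_WITH_SECRET")
// Pin the provider so S3A uses the keys above instead of walking the default chain.
hadoopConf.set("fs.s3a.aws.credentials.provider",
  "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")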

Though I have setMaster as local, my Spark application gives an error

I have the following application (I am starting and stopping Spark) on Windows. I use Scala IDE (Eclipse). I get the "A master URL must be set in your configuration" error even though I have set it here. I am using Spark 2.4.4.
Can someone please help me fix this issue?
import org.apache.spark._
import org.apache.spark.sql._

object SampleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Simple Application")

    val sc = new SparkContext(conf)
    sc.stop()
  }
}
The error is:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/10/28 22:58:56 INFO SparkContext: Running Spark version 2.4.4
19/10/28 22:58:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19/10/28 22:58:56 ERROR SparkContext: Error initializing SparkContext.
org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:368)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$5(SparkSession.scala:935)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:926)
at com.spark.renga.SampleApp$.main(SampleApp.scala:8)
at com.spark.renga.SampleApp.main(SampleApp.scala)
19/10/28 22:58:56 ERROR Utils: Uncaught exception in thread main
java.lang.NullPointerException
at org.apache.spark.SparkContext.postApplicationEnd(SparkContext.scala:2416)
at org.apache.spark.SparkContext.$anonfun$stop$2(SparkContext.scala:1931)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1931)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:585)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$5(SparkSession.scala:935)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:926)
at com.spark.renga.SampleApp$.main(SampleApp.scala:8)
at com.spark.renga.SampleApp.main(SampleApp.scala)
19/10/28 22:58:56 INFO SparkContext: Successfully stopped SparkContext
Exception in thread "main" org.apache.spark.SparkException: A master URL must be set in your configuration
at org.apache.spark.SparkContext.<init>(SparkContext.scala:368)
at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$5(SparkSession.scala:935)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:926)
at com.spark.renga.SampleApp$.main(SampleApp.scala:8)
at com.spark.renga.SampleApp.main(SampleApp.scala)
Note that the stack trace goes through SparkSession$Builder.getOrCreate at SampleApp.scala:8, which suggests the code that actually ran built a SparkSession without a master set, rather than the SparkContext shown above. If you are using version 2.4.4, try this:
import org.apache.spark.sql.SparkSession

object SampleApp {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("test")
      .getOrCreate()

    println(spark.sparkContext.version)
    spark.stop()
  }
}
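
If the same application is later submitted to a cluster, a common pattern is to leave the master out of the code entirely and supply it with spark-submit --master, so a hard-coded local[*] never overrides the cluster settings. A minimal sketch of that variant:

import org.apache.spark.sql.SparkSession

object SampleApp {
  def main(args: Array[String]): Unit = {
    // No .master() here: the master comes from spark-submit,
    // e.g. --master local[*] when testing or --master yarn on a cluster.
    val spark = SparkSession.builder
      .appName("test")
      .getOrCreate()

    println(spark.sparkContext.version)
    spark.stop()
  }
}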

Failed to load com.saprk.demo.Hive. java.lang.ClassNotFoundException: com.saprk.demo.Hive

package com.saprk.demo

import org.apache.spark.sql.SparkSession

object Hive {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("Spark SQL basic example")
      .config("hive.metastore.warehouse.dir", "hdfs://user/hive/warehouse")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("create database employee")
    spark.sql("show databases").show()
  }
}
I am trying to create a database in Hive through Spark, and while submitting this on Amazon EMR I am getting the exception:
Failed to load com.saprk.demo.Hive. java.lang.ClassNotFoundException: com.saprk.demo.Hive

Spark Streaming works in Local mode but "stages fail" with "could not initialize class" in Client/Cluster mode

I have a Spark + Kafka streaming app that runs fine in local mode; however, when I try to launch it in YARN client/cluster mode I get several errors like the ones below.
The first error I always see is:
WARN TaskSetManager: Lost task 1.1 in stage 3.0 (TID 9, ip-xxx-24-129-36.ec2.internal, executor 2): java.lang.NoClassDefFoundError: Could not initialize class TestStreaming$
at TestStreaming$$anonfun$main$1$$anonfun$apply$1.apply(TestStreaming.scala:60)
at TestStreaming$$anonfun$main$1$$anonfun$apply$1.apply(TestStreaming.scala:59)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
The next error I get is:
ERROR JobScheduler: Error running job streaming job 1541786030000 ms.0
followed by
java.lang.NoClassDefFoundError: Could not initialize class
Spark version 2.1.0
Scala 2.11
Kafka version 10
Part of my code loads the config in main when the app is launched. I pass this config file at runtime with -conf AFTER the jar (see below). I'm not quite sure, but must I pass this config to the executors as well?
I launch my streaming app with the commands below. One shows local mode, the other client mode.
runJar=myProgram.jar
loggerPath=/path/to/log4j.properties
mainClass=TestStreaming
logger=-DPHDTKafkaConsumer.app.log4j=$loggerPath
confFile=application.conf
-----------Local Mode----------
SPARK_KAFKA_VERSION=0.10 nohup spark2-submit --driver-java-options \
  "$logger" --conf "spark.executor.extraJavaOptions=$logger" --class \
  $mainClass --master local[4] $runJar -conf $confFile &
-----------Client Mode----------
SPARK_KAFKA_VERSION=0.10 nohup spark2-submit --master yarn \
  --conf "spark.executor.extraJavaOptions=$logger" \
  --conf "spark.driver.extraJavaOptions=$logger" \
  --class $mainClass $runJar -conf $confFile &
Here is my code. I have been battling this for over a week now.
import Util.UtilFunctions
import UtilFunctions.config
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.log4j.Logger

object TestStreaming extends Serializable {

  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)

  def main(args: Array[String]) {
    logger.info("Starting app")

    UtilFunctions.loadConfig(args)
    UtilFunctions.loadLogger()

    val props: Map[String, String] = setKafkaProperties()
    val topic = Set(config.getString("config.TOPIC_NAME"))

    val conf = new SparkConf()
      .setAppName(config.getString("config.SPARK_APP_NAME"))
      .set("spark.streaming.backpressure.enabled", "true")

    val spark = SparkSession.builder()
      .config(conf)
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    ssc.sparkContext.setLogLevel("INFO")
    ssc.checkpoint(config.getString("config.SPARK_CHECKPOINT_NAME"))

    val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topic, props))
    val distRecordsStream = kafkaStream.map(record => (record.key(), record.value()))
    distRecordsStream.window(Seconds(10), Seconds(10))

    distRecordsStream.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        rdd.foreach(record => {
          println(record._2) // value from kafka
        })
      }
    })

    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }

  def setKafkaProperties(): Map[String, String] = {
    val deserializer = "org.apache.kafka.common.serialization.StringDeserializer"
    val zookeeper = config.getString("config.ZOOKEEPER")
    val offsetReset = config.getString("config.OFFSET_RESET")
    val brokers = config.getString("config.BROKERS")
    val groupID = config.getString("config.GROUP_ID")
    val autoCommit = config.getString("config.AUTO_COMMIT")
    val maxPollRecords = config.getString("config.MAX_POLL_RECORDS")
    val maxPollIntervalms = config.getString("config.MAX_POLL_INTERVAL_MS")

    val props = Map(
      "bootstrap.servers" -> brokers,
      "zookeeper.connect" -> zookeeper,
      "group.id" -> groupID,
      "key.deserializer" -> deserializer,
      "value.deserializer" -> deserializer,
      "enable.auto.commit" -> autoCommit,
      "auto.offset.reset" -> offsetReset,
      "max.poll.records" -> maxPollRecords,
      "max.poll.interval.ms" -> maxPollIntervalms)
    props
  }
}
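
A "Could not initialize class TestStreaming$" error on an executor means that static initialization of the object failed in that JVM the first time the foreachRDD closure touched it, and with this layout the externally loaded config is the usual suspect: it is read on the driver from a local application.conf, but nothing ships that file to the executors. Two things that are commonly tried here: pass the file with --files application.conf on spark-submit so it lands in each executor's working directory, and load it lazily so each JVM parses it on first use instead of during class initialization. A rough sketch under those assumptions, using the Typesafe Config library that config.getString suggests (AppConfig is a hypothetical helper, not part of the original code):

import java.io.File
import com.typesafe.config.{Config, ConfigFactory}

// Hypothetical helper: parses application.conf lazily in whichever JVM first
// touches it. With `--files application.conf` on spark-submit, the file is
// copied into each executor's working directory, so the same relative path
// resolves on the driver and on the executors.
object AppConfig extends Serializable {
  @transient lazy val config: Config =
    ConfigFactory.parseFile(new File("application.conf")).resolve()
}

Alternatively, read every needed value from the config on the driver into plain String vals and close over those, so the executors never have to initialize a config-holding object at all.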
