How to connect HBase through the Spark SQL context? - apache-spark

I'm working on a scenario where I want to access HBase from Spark SQL and read it as a DataFrame. Below is the code.
import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import com.google.protobuf.ServiceException;
public class hbasesql {
public static void main(String args[])
{
SparkConf conf = new SparkConf().setAppName("HbaseStreamTest").setMaster("yarn-client");
conf.set("spark.executor.extraClassPath", "/etc/hbase/conf/hbase-site.xml");
conf.set("hbase.zookeeper.quorum", "=servername:2181");
conf.set("hbase.zookeeper.property.clientPort", "2181");
Configuration hconf = HBaseConfiguration.create();
hconf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));
hconf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
final JavaSparkContext context = new JavaSparkContext(conf);
SQLContext sqlContext = new org.apache.spark.sql.SQLContext(context);
try {
HBaseAdmin.checkHBaseAvailable(hconf);
System.out.println("HBase is running"); //This works
} catch (ServiceException e) {
System.out.println("HBase is not running");
e.printStackTrace();
} catch (MasterNotRunningException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ZooKeeperConnectionException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String sqlMapping = "KEY_FIELD STRING :key" + ", sql_firstname STRING c1:FIRSTNAME" + ","
+ "sql_lastname STRING c1:LASTNAME" ;
HashMap<String, String> colMap = new HashMap<String, String>();
colMap.put("hbase.columns.mapping", sqlMapping);
colMap.put("hbase.table", "testtable");
// DataFrame dfJail =
DataFrame df = sqlContext.read().format("org.apache.hadoop.hbase.spark").options(colMap).load();
//DataFrame df = sqlContext.load("org.apache.hadoop.hbase.spark", colMap);
// This is useful when issuing SQL text queries directly against the
// sqlContext object.
df.registerTempTable("temp_emp");
DataFrame result = sqlContext.sql("SELECT count(*) from temp_emp");
System.out.println("df " + df);
System.out.println("result " + result);
df.show();
}
}
But while running it I'm getting the exception below.
17/02/22 00:44:28 INFO cluster.YarnClientSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.8
17/02/22 00:44:28 INFO storage.BlockManagerMasterEndpoint: Registering block manager servername:32831 with 2.1 GB RAM, BlockManagerId(2, servername, 32831)
Exception in thread "main" java.lang.NullPointerException
at org.apache.hadoop.hbase.spark.HBaseRelation.<init>(DefaultSource.scala:175)
at org.apache.hadoop.hbase.spark.DefaultSource.createRelation(DefaultSource.scala:78)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
at hbasesql.main(hbasesql.java:65)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
17/02/22 00:44:28 INFO spark.SparkContext: Invoking stop() from shutdown hook
I guess the SQLContext created from the JavaSparkContext is not able to load the hbase-site.xml configuration, or is there some other problem?
Any help is very much appreciated.
Thanks in advance :)
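The NullPointerException is thrown inside HBaseRelation's constructor, which suggests the hbase-spark data source never received an HBase configuration. One hedged workaround, assuming the hbase-spark module on the classpath exposes JavaHBaseContext, is to create the connector's context from the same hconf before calling load(), so the relation can pick the configuration up from there. This is a sketch of that idea, not a verified fix for this exact build:
import org.apache.hadoop.hbase.spark.JavaHBaseContext;
// ... inside main(), after hconf and the JavaSparkContext are created:
// Hand the connector the HBase configuration (hbase-site.xml / core-site.xml
// were already added to hconf above) before reading through the data source.
JavaHBaseContext hbaseContext = new JavaHBaseContext(context, hconf);
DataFrame df = sqlContext.read()
        .format("org.apache.hadoop.hbase.spark")
        .options(colMap)
        .load();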

Related

failed to create SFTP Session

I'm using the code below to connect to FTP server(s) for transferring files. The server details are entered into a Postgres DB through an HTML interface, and the Spring Integration code should take the config from the DB, rotate through the list, and pull the files from a specific folder.
It tries to connect and reports that it is connected, but nothing is transferred; after a couple of minutes it disconnects and then tries again.
I have two branches in the DB to rotate through, but it only ever tries the first one.
The disconnect error is "java.lang.IllegalStateException: failed to create SFTP Session". (A minimal standalone connectivity check is sketched after the configuration class below.)
Here is the full log.
2018-10-23 11:53:51.694 INFO 1940 --- [ask-scheduler-1] com.jcraft.jsch : Connecting to cai-notes-fs.cai.ei port 21
2018-10-23 11:53:51.834 INFO 1940 --- [ask-scheduler-1] com.jcraft.jsch : Connection established
2018-10-23 11:56:01.227 INFO 1940 --- [ask-scheduler-1] com.jcraft.jsch : Disconnecting from cai-notes-fs.cai.ei port 21
2018-10-23 11:56:01.242 ERROR 1940 --- [ask-scheduler-1] o.s.integration.handler.LoggingHandler : org.springframework.messaging.MessagingException: Problem occurred while synchronizing remote to local directory; nested exception is org.springframework.messaging.MessagingException: Failed to execute on session; nested exception is java.lang.IllegalStateException: failed to create SFTP Session
at org.springframework.integration.file.remote.synchronizer.AbstractInboundFileSynchronizer.synchronizeToLocalDirectory(AbstractInboundFileSynchronizer.java:331)
at org.springframework.integration.file.remote.synchronizer.AbstractInboundFileSynchronizingMessageSource.doReceive(AbstractInboundFileSynchronizingMessageSource.java:260)
at org.springframework.integration.file.remote.synchronizer.AbstractInboundFileSynchronizingMessageSource.doReceive(AbstractInboundFileSynchronizingMessageSource.java:65)
at org.springframework.integration.endpoint.AbstractFetchLimitingMessageSource.doReceive(AbstractFetchLimitingMessageSource.java:43)
at org.springframework.integration.endpoint.AbstractMessageSource.receive(AbstractMessageSource.java:154)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.aop.support.AopUtils.invokeJoinpointUsingReflection(AopUtils.java:343)
at org.springframework.aop.framework.ReflectiveMethodInvocation.invokeJoinpoint(ReflectiveMethodInvocation.java:197)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:163)
at org.springframework.integration.aop.AbstractMessageSourceAdvice.invoke(AbstractMessageSourceAdvice.java:43)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:185)
at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:212)
at com.sun.proxy.$Proxy121.receive(Unknown Source)
at org.springframework.integration.endpoint.SourcePollingChannelAdapter.receiveMessage(SourcePollingChannelAdapter.java:236)
at org.springframework.integration.endpoint.AbstractPollingEndpoint.doPoll(AbstractPollingEndpoint.java:249)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.springframework.aop.support.AopUtils.invokeJoinpointUsingReflection(AopUtils.java:343)
at org.springframework.aop.framework.ReflectiveMethodInvocation.invokeJoinpoint(ReflectiveMethodInvocation.java:197)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:163)
at org.springframework.transaction.interceptor.TransactionAspectSupport.invokeWithinTransaction(TransactionAspectSupport.java:294)
at org.springframework.transaction.interceptor.TransactionInterceptor.invoke(TransactionInterceptor.java:98)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:185)
at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:212)
at com.sun.proxy.$Proxy120.call(Unknown Source)
at org.springframework.integration.endpoint.AbstractPollingEndpoint$Poller.lambda$run$0(AbstractPollingEndpoint.java:378)
at org.springframework.integration.util.ErrorHandlingTaskExecutor.lambda$execute$0(ErrorHandlingTaskExecutor.java:53)
at org.springframework.core.task.SyncTaskExecutor.execute(SyncTaskExecutor.java:50)
at org.springframework.integration.util.ErrorHandlingTaskExecutor.execute(ErrorHandlingTaskExecutor.java:51)
at org.springframework.integration.endpoint.AbstractPollingEndpoint$Poller.run(AbstractPollingEndpoint.java:372)
at org.springframework.scheduling.support.DelegatingErrorHandlingRunnable.run(DelegatingErrorHandlingRunnable.java:54)
at org.springframework.scheduling.concurrent.ReschedulingRunnable.run(ReschedulingRunnable.java:93)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.springframework.messaging.MessagingException: Failed to execute on session; nested exception is java.lang.IllegalStateException: failed to create SFTP Session
at org.springframework.integration.file.remote.RemoteFileTemplate.execute(RemoteFileTemplate.java:445)
at org.springframework.integration.file.remote.synchronizer.AbstractInboundFileSynchronizer.synchronizeToLocalDirectory(AbstractInboundFileSynchronizer.java:286)
... 43 more
Caused by: java.lang.IllegalStateException: failed to create SFTP Session
at org.springframework.integration.sftp.session.DefaultSftpSessionFactory.getSession(DefaultSftpSessionFactory.java:393)
at org.springframework.integration.sftp.session.DefaultSftpSessionFactory.getSession(DefaultSftpSessionFactory.java:57)
at org.springframework.integration.file.remote.session.DelegatingSessionFactory.getSession(DelegatingSessionFactory.java:111)
at org.springframework.integration.file.remote.session.DelegatingSessionFactory.getSession(DelegatingSessionFactory.java:105)
at org.springframework.integration.file.remote.RemoteFileTemplate.execute(RemoteFileTemplate.java:431)
... 44 more
Caused by: java.lang.IllegalStateException: failed to connect
at org.springframework.integration.sftp.session.SftpSession.connect(SftpSession.java:272)
at org.springframework.integration.sftp.session.DefaultSftpSessionFactory.getSession(DefaultSftpSessionFactory.java:388)
... 48 more
Caused by: com.jcraft.jsch.JSchException: Session.connect: java.net.SocketException: Connection reset
at com.jcraft.jsch.Session.connect(Session.java:565)
at com.jcraft.jsch.Session.connect(Session.java:183)
at org.springframework.integration.sftp.session.SftpSession.connect(SftpSession.java:263)
... 49 more
2018-10-23 11:56:11.248 INFO 1940 --- [ask-scheduler-1] com.jcraft.jsch : Connecting to cai-notes-fs.cai.ei port 21
2018-10-23 11:56:11.414 INFO 1940 --- [ask-scheduler-1] com.jcraft.jsch : Connection established
Here is the correct code
package fefo.springframeworkftp.spring4ftpapp.configuration;
import com.jcraft.jsch.ChannelSftp;
import fefo.springframeworkftp.spring4ftpapp.model.Branch;
import fefo.springframeworkftp.spring4ftpapp.repository.BranchRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.integration.channel.DirectChannel;
import org.springframework.integration.channel.NullChannel;
import org.springframework.integration.dsl.IntegrationFlow;
import org.springframework.integration.dsl.IntegrationFlows;
import org.springframework.integration.dsl.Pollers;
import org.springframework.integration.dsl.SourcePollingChannelAdapterSpec;
import org.springframework.integration.expression.FunctionExpression;
import org.springframework.integration.file.remote.*;
import org.springframework.integration.file.remote.aop.RotatingServerAdvice;
import org.springframework.integration.file.remote.session.DelegatingSessionFactory;
import org.springframework.integration.file.remote.session.SessionFactory;
import org.springframework.integration.scheduling.PollerMetadata;
import org.springframework.integration.sftp.dsl.Sftp;
import org.springframework.integration.sftp.dsl.SftpInboundChannelAdapterSpec;
import org.springframework.integration.sftp.session.DefaultSftpSessionFactory;
import org.springframework.messaging.MessageChannel;
import org.springframework.stereotype.Component;
import java.io.File;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
@Configuration
@Component
public class SFTIntegration {
public static final String TIMEZONE_UTC = "UTC";
public static final String TIMESTAMP_FORMAT_OF_FILES = "yyyyMMddHHmmssSSS";
public static final String TEMPORARY_FILE_SUFFIX = ".part";
public static final int POLLER_FIXED_PERIOD_DELAY = 5000;
public static final int MAX_MESSAGES_PER_POLL = 100;
private static final Logger LOG = LoggerFactory.getLogger(SFTIntegration.class);
private static final String CHANNEL_INTERMEDIATE_STAGE = "intermediateChannel";
/* pulling the server config from postgres DB*/
private final BranchRepository branchRepository;
@Value("${app.temp-dir}")
private String localTempPath;
public SFTIntegration(BranchRepository branchRepository) {
this.branchRepository = branchRepository;
}
/**
* The default poller with 5s, 100 messages, RotatingServerAdvice and transaction.
*
* @return default poller.
*/
@Bean(name = PollerMetadata.DEFAULT_POLLER)
public PollerMetadata poller(){
return Pollers
.fixedDelay(POLLER_FIXED_PERIOD_DELAY)
.advice(advice())
.maxMessagesPerPoll(MAX_MESSAGES_PER_POLL)
.transactional()
.get();
}
/**
* The direct channel for the flow.
*
* @return MessageChannel
*/
@Bean
public MessageChannel stockIntermediateChannel() {
return new DirectChannel();
}
/**
* Get the files from a remote directory. Add a timestamp to the filename
* and write them to a local temporary folder.
*
* @return IntegrationFlow
*/
@Bean
public IntegrationFlow fileInboundFlowFromSFTPServer(){
final SftpInboundChannelAdapterSpec sourceSpec = Sftp.inboundAdapter(delegatingSFtpSessionFactory())
.preserveTimestamp(true)
.patternFilter("*.csv")
.deleteRemoteFiles(true)
.maxFetchSize(MAX_MESSAGES_PER_POLL)
.remoteDirectory("/")
.localDirectory(new File(localTempPath))
.temporaryFileSuffix(TEMPORARY_FILE_SUFFIX)
.localFilenameExpression(new FunctionExpression<String>(s -> {
final int fileTypeSepPos = s.lastIndexOf('.');
return DateTimeFormatter
.ofPattern(TIMESTAMP_FORMAT_OF_FILES)
.withZone(ZoneId.of(TIMEZONE_UTC))
.format(Instant.now())
+ "_"
+ s.substring(0, fileTypeSepPos)
+ s.substring(fileTypeSepPos);
}));
// Poller definition
final Consumer<SourcePollingChannelAdapterSpec> stockInboundPoller = endpointConfigurer -> endpointConfigurer
.id("stockInboundPoller")
.autoStartup(true)
.poller(poller());
return IntegrationFlows
.from(sourceSpec, stockInboundPoller)
.transform(File.class, p -> {
// log step
LOG.info("flow=stockInboundFlowFromAFT, message=incoming file: " + p);
return p;
})
.channel(CHANNEL_INTERMEDIATE_STAGE)
.get();
}
@Bean
public IntegrationFlow stockIntermediateStageChannel() {
return IntegrationFlows
.from(CHANNEL_INTERMEDIATE_STAGE)
.transform(p -> {
//log step
LOG.info("flow=stockIntermediateStageChannel, message=rename file: " + p);
return p;
})
//TODO
.channel(new NullChannel())
.get();
}
public DefaultSftpSessionFactory createNewSftpSessionFactory(final Branch branch){
final DefaultSftpSessionFactory factory = new DefaultSftpSessionFactory(false);
factory.setHost(branch.getHost());
factory.setUser(branch.getUsern());
factory.setPort(branch.getFtpPort());
factory.setPassword(branch.getPassword());
factory.setAllowUnknownKeys(true);
return factory;
}
@Bean
public DelegatingSessionFactory<ChannelSftp.LsEntry> delegatingSFtpSessionFactory(){
final List<Branch> branchConnections = new ArrayList<>();
branchRepository.findAll().forEach(branchConnections::add);
if(branchConnections.isEmpty()){
return null;
}
final Map<Object, SessionFactory<ChannelSftp.LsEntry>> factories = new LinkedHashMap<>(10);
for (Branch br : branchConnections) {
// create a factory for every key containing server type, url and port
if (factories.get(br.getId()) == null) {
factories.put(br.getId(), createNewSftpSessionFactory(br));
}
}
// use the first SF as the default
return new DelegatingSessionFactory<>(factories, factories.values().iterator().next());
}
@Bean
public RotatingServerAdvice advice(){
final List<Branch> branchConnections = new ArrayList<>();
branchRepository.findAll().forEach(branchConnections::add);
LOG.info("Found " + branchConnections.size() + " server entries for FEFO Stock.");
final List<RotatingServerAdvice.KeyDirectory> keyDirectories = new ArrayList<>();
for (final Branch br : branchConnections) {
keyDirectories
.add(new RotatingServerAdvice.KeyDirectory(br.getId().toString(), br.getFolderPath()));
}
/*final RotatingServerAdvice rot = new RotatingServerAdvice(
new MyStandardRotationPolicy(delegatingSFtpSessionFactory(), keyDirectories, true,
getFilter(), partnerConfigRepo));
return rot;*/
return new RotatingServerAdvice(delegatingSFtpSessionFactory(), keyDirectories, true);
}
}
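If the session factory itself is in doubt, it can help to test it outside the integration flow. The following is a minimal, hypothetical standalone check built from the same DefaultSftpSessionFactory settings; the user and password are placeholders, and note that the log above shows connections to port 21, whereas SFTP servers normally listen on port 22:
import com.jcraft.jsch.ChannelSftp;
import org.springframework.integration.file.remote.session.Session;
import org.springframework.integration.sftp.session.DefaultSftpSessionFactory;
public class SftpConnectivityCheck {
    public static void main(String[] args) throws Exception {
        final DefaultSftpSessionFactory factory = new DefaultSftpSessionFactory(false);
        factory.setHost("cai-notes-fs.cai.ei"); // host taken from the log above
        factory.setPort(22);                    // SFTP usually runs on 22, not FTP's 21
        factory.setUser("user");                // placeholder
        factory.setPassword("secret");          // placeholder
        factory.setAllowUnknownKeys(true);
        // getSession() performs the same JSch connect that fails in the flow above
        final Session<ChannelSftp.LsEntry> session = factory.getSession();
        try {
            System.out.println("SFTP session open: " + session.isOpen());
            // listing the remote root proves the SFTP channel actually works
            for (ChannelSftp.LsEntry entry : session.list("/")) {
                System.out.println(entry.getFilename());
            }
        } finally {
            session.close();
        }
    }
}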

Apache Spark can not read Kafka message Content

I am trying to create an Apache Spark job that consumes Kafka messages submitted to a topic. I submit messages to the topic using kafka-console-producer as below.
./kafka-console-producer.sh --broker-list kafka1:9092 --topic my-own-topic
To read the messages I am using the spark-streaming-kafka-0-10_2.11 library. With it I manage to read the total count of the messages received on the topic, but I cannot read the ConsumerRecord objects in the stream: when I try to, the entire application blocks and nothing is printed to the console. Note that I am running Kafka, Zookeeper and Spark in Docker containers. Help would be greatly appreciated. (A hedged alternative that reads the record values directly from the stream is sketched after the sample output below.)
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;
public class SparkKafkaStreamingJDBCExample {
public static void main(String[] args) {
// Start a spark instance and get a context
SparkConf conf =
new SparkConf().setAppName("Study Spark").setMaster("spark://spark-master:7077");
// Setup a streaming context.
JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(3));
// Create a map of Kafka params
Map<String, Object> kafkaParams = new HashMap<String, Object>();
// List of Kafka brokers to listen to.
kafkaParams.put("bootstrap.servers", "kafka1:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
// Do you want to start from the earliest record or the latest?
kafkaParams.put("auto.offset.reset", "earliest");
kafkaParams.put("enable.auto.commit", true);
// List of topics to listen to.
Collection<String> topics = Arrays.asList("my-own-topic");
// Create a Spark DStream with the kafka topics.
final JavaInputDStream<ConsumerRecord<String, String>> stream =
KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(),
ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
System.out.println("Study Spark Example Starting ....");
stream.foreachRDD(rdd -> {
if (rdd.isEmpty()) {
System.out.println("RDD Empty " + rdd.count());
return;
} else {
System.out.println("RDD not empty " + rdd.count());
OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
System.out.println("Partition Id " + TaskContext.getPartitionId());
OffsetRange o = offsetRanges[TaskContext.getPartitionId()];
System.out.println("Topic " + o.topic());
System.out.println("Creating RDD !!!");
JavaRDD<ConsumerRecord<String, String>> r =
KafkaUtils.createRDD(streamingContext.sparkContext(), kafkaParams, offsetRanges,
LocationStrategies.PreferConsistent());
System.out.println("Count " + r.count());
//Application stuck from here onwards ...
ConsumerRecord<String, String> first = r.first();
System.out.println("First taken");
System.out.println("First value " + first.value());
}
});
System.out.println("Stream context starting ...");
// Start streaming.
streamingContext.start();
System.out.println("Stream context started ...");
try {
System.out.println("Stream context await termination ...");
streamingContext.awaitTermination();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
Sample output is given below as well.
Study Spark Example Starting ....
Stream context starting ...
Stream context started ...
Stream context await termination ...
RDD Empty 0
RDD Empty 0
RDD Empty 0
RDD Empty 0
RDD not empty 3
Partition Id 0
Topic my-own-topic
Creating RDD !!!
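The hang appears to start where the code above builds a second RDD with KafkaUtils.createRDD inside foreachRDD using the same group id. A hedged, simpler alternative is to read the record values directly from the RDD the stream already delivers; the sketch below assumes the same stream, topic and kafkaParams as above:
stream.foreachRDD(rdd -> {
    System.out.println("Batch size: " + rdd.count());
    // ConsumerRecord itself is not Java-serializable, so map each record to a
    // plain String before collecting results back to the driver for printing.
    for (String line : rdd.map(record -> record.key() + " -> " + record.value()).collect()) {
        System.out.println(line);
    }
});
For anything beyond a quick test, foreachPartition (or writing the mapped values out directly) avoids collecting whole batches to the driver.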

While creating sequenceFile getting ERROR nativeio.NativeIO: Unable to initialize NativeIO libraries

I am using standalone Spark, and while running a program that writes a sequence file from a pair RDD I am getting the error below:
ERROR nativeio.NativeIO: Unable to initialize NativeIO libraries
java.lang.NoSuchFieldError: workaroundNonThreadSafePasswdCalls
at org.apache.hadoop.io.nativeio.NativeIO.initNative(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO.<clinit>(NativeIO.java:58)
at org.apache.hadoop.fs.FileUtil.setPermission(FileUtil.java:653)
at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:509)
at org.apache.hadoop.fs.FilterFileSystem.setPermission(FilterFileSystem.java:286)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:385)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:364)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:555)
at org.apache.hadoop.io.SequenceFile$Writer.<init>(SequenceFile.java:892)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:393)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:354)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:476)
at org.apache.hadoop.mapred.SequenceFileOutputFormat.getRecordWriter(SequenceFileOutputFormat.java:58)
at org.apache.spark.SparkHadoopWriter.open(SparkHadoopWriter.scala:89)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$13.apply(PairRDDFunctions.scala:980)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$13.apply(PairRDDFunctions.scala:974)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:54)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:177)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Following is the code that I am using:
import java.util.ArrayList;
import java.util.List;
import scala.Tuple2;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
public class BasicSaveSequenceFile {
public static class ConvertToWritableTypes implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
return new Tuple2(new Text(record._1), new IntWritable(record._2));
}
}
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
}
String master = args[0];
String fileName = args[1];
JavaSparkContext sc = new JavaSparkContext(
master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
List<Tuple2<String, Integer>> input = new ArrayList();
input.add(new Tuple2("coffee", 1));
input.add(new Tuple2("coffee", 2));
input.add(new Tuple2("pandas", 3));
JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
}
I have added all the dependencies in pom.xml. Can you please help?
I upgraded the spark-core jar from version 1.1.0 to 1.3.1 in pom.xml and also removed the dependency on the hadoop-common jar from pom.xml.
That worked for me!

Spark FileStreaming issue

I am trying a simple file streaming example using Spark Streaming (spark-streaming_2.10, version 1.5.1):
public class DStreamExample {
public static void main(final String[] args) {
final SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("SparkJob");
sparkConf.setMaster("local[4]"); // for local
final JavaSparkContext sc = new JavaSparkContext(sparkConf);
final JavaStreamingContext ssc = new JavaStreamingContext(sc,
new Duration(2000));
final JavaDStream<String> lines = ssc.textFileStream("/opt/test/");
lines.print();
ssc.start();
ssc.awaitTermination();
}
}
When I run this code on a single file or directory it does not print anything from the file; I can see in the logs that it is constantly polling, but nothing is printed. I tried moving a file into the directory while the program was running.
Is there something I am missing? I also tried applying a map function on the lines DStream, which does not work either.
The textFileStream API is not meant to read existing directory content; its purpose is to monitor the given Hadoop-compatible filesystem path for changes. Files must be written into the monitored location by "moving" them from another location within the same file system.
In short, you are subscribing to directory changes and will receive the content of files that newly appear within the monitored location, in the state they are in at the moment of the monitoring snapshot (a 2000 ms interval in your case). Any further updates to an existing file will not reach the stream; only directory updates (new files) will.
One way to emulate updates is to create a new file during your monitoring session:
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import java.io.File;
import java.io.IOException;
import java.util.List;
public class DStreamExample {
public static void main(final String[] args) throws IOException {
final SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("SparkJob");
sparkConf.setMaster("local[4]"); // for local
final JavaSparkContext sc = new JavaSparkContext(sparkConf);
final JavaStreamingContext ssc = new JavaStreamingContext(sc,
new Duration(2000));
final JavaDStream<String> lines = ssc.textFileStream("/opt/test/");
// spawn the thread which will create new file within the monitored directory soon
Runnable r = () -> {
try {
Thread.sleep(5000);
} catch (InterruptedException e) {
e.printStackTrace();
}
try {
FileUtils.write(new File("/opt/test/newfile1"), "whatever");
} catch (IOException e) {
e.printStackTrace();
}
};
new Thread(r).start();
lines.foreachRDD((Function<JavaRDD<String>, Void>) rdd -> {
List<String> lines1 = rdd.collect();
lines1.stream().forEach(l -> System.out.println(l));
return null;
});
ssc.start();
ssc.awaitTermination();
}
}

JavaStreamingContext nullpointer exception while fetching data from Cassandra

I want to read file data and check whether each file line is already present in Cassandra; if it is present it needs to be merged, otherwise it is a fresh insert to C*.
The file data just contains name and address in JSON format. In Cassandra, the student table has a UUID as primary key and there is a secondary index on name.
Once the data is merged into Cassandra, I want to send the new or existing UUID to Kafka.
When I run locally, or on a single machine on the Mesos cluster (keeping the line sparkConf.setMaster("local[4]");), this program works, but when I submit to the Mesos master with 4 slaves (commenting out the line //sparkConf.setMaster("local[4]"); for the cluster) there is a NullPointerException while selecting data from Cassandra via the JavaStreamingContext.
I made the streaming context static because earlier it was throwing a serialization exception, since it was being accessed inside the map transformation of the file DStream.
Is there something wrong with the approach, or is it because I am trying to build a Cassandra RDD within the DStream map transformation, which is causing the issue? (A hedged alternative lookup pattern is sketched after the stack trace below.)
import kafka.producer.KeyedMessage;
import com.datastax.spark.connector.japi.CassandraStreamingJavaUtil;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.util.Properties;
import java.util.UUID;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriter;
import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriterFactory;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapRowTo;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;
public class DStreamExample {
public DStreamExample() {
}
private static JavaStreamingContext ssc;
public static void main(final String[] args) {
final SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("SparkJob");
sparkConf.setMaster("local[4]"); // for local
sparkConf.set("spark.cassandra.connection.host", cassandra_hosts);
ssc = new JavaStreamingContext(sparkConf,new Duration(2000));
final JavaDStream<Student> studentFileDStream = ssc.textFileStream(
"/usr/local/fileDir/").map(line -> {
final Gson gson = new Gson();
final JsonParser parser = new JsonParser();
final JsonObject jsonObject = parser.parse(line)
.getAsJsonObject();
final Student studentFile = gson.fromJson(jsonObject,
Student.class);
// generate a new UUID by default; replaced below if the student already exists
studentFile.setId(UUID.randomUUID());
try{
//NullPointer at this line while running on cluster
final JavaRDD<Student> cassandraStudentRDD =
CassandraStreamingJavaUtil.javaFunctions(ssc)
.cassandraTable("keyspace", "student",
mapRowTo(Student.class)).where("name=?",
studentFile.getName());
//If student name is found in cassandra table then assign UUID to fileStudent object
//This way i wont create multiple records for same name student
final Student studentCassandra = cassandraStudentRDD.first();
studentFile.setId(studentCassandra.getId());
}catch(Exception e){
}
return studentFile;
});
//Save student to Cassandra
CassandraStreamingJavaUtil.javaFunctions(studentFileDStream)
.writerBuilder("keyspace", "student", mapToRow(Student.class))
.saveToCassandra();
final JavaDStreamKafkaWriter<Student> writer =
JavaDStreamKafkaWriterFactory.fromJavaDStream(studentFileDStream);
final Properties properties = new Properties();
properties.put("metadata.broker.list", "server:9092");
properties.put("serializer.class", "kafka.serializer.StringEncoder");
//Just send studnet UUID_PUT to kafka
writer.writeToKafka(properties,
student ->
new KeyedMessage<>("TOPICNAME", student.getId() + "_PUT"));
ssc.start();
ssc.awaitTermination();
}
}
class Student {
private String address;
private UUID id;
private String name;
public Student() {
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public UUID getId() {
return id;
}
public void setId(UUID id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
Exception stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, servername): java.lang.NullPointerException
at com.datastax.spark.connector.japi.CassandraStreamingJavaUtil.javaFunctions(CassandraStreamingJavaUtil.java:39)
at com.ebates.ps.batch.sparkpoc.DStreamPOCExample.lambda$main$d2c4cc2c$1(DStreamPOCExample.java:109)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:47)
at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:45)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:898)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:896)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:896)
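For what it's worth, CassandraStreamingJavaUtil.javaFunctions(ssc) goes through the driver-side streaming context, and the map function runs on the executors, where that static ssc is not initialized; that matches the NullPointerException at CassandraStreamingJavaUtil.javaFunctions in the trace above. A hedged alternative, assuming a spark-cassandra-connector version that still exposes the DataStax Java driver's Session, is to do the per-line lookup through a serializable CassandraConnector instead of building an RDD inside the transformation. The keyspace, table and column names below simply mirror the question and are otherwise illustrative:
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.spark.connector.cql.CassandraConnector;
// Built once on the driver; CassandraConnector is serializable and opens
// sessions on whichever executor ends up running the map function.
final CassandraConnector connector = CassandraConnector.apply(sparkConf);
final JavaDStream<Student> studentFileDStream = ssc.textFileStream("/usr/local/fileDir/")
    .map(line -> {
        final Student studentFile = new Gson().fromJson(line, Student.class);
        studentFile.setId(UUID.randomUUID()); // default to a fresh UUID
        final Session session = connector.openSession();
        try {
            // reuse the existing id if a student with this name is already stored
            final Row row = session.execute(
                "SELECT id FROM keyspace.student WHERE name = ?",
                studentFile.getName()).one();
            if (row != null) {
                studentFile.setId(row.getUUID("id"));
            }
        } finally {
            session.close();
        }
        return studentFile;
    });
Opening a session per line keeps the sketch short; in practice the lookup would be batched with mapPartitions or the connector's joinWithCassandraTable.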
