java.lang.IllegalStateException: Adding new inputs, transformations, and output operations after starting a context is not supported - apache-spark

I get the exception below when I try to create a DStream inside a Function call in Spark.
My call method:
@Override
public JavaRDD<Object> call(JavaRDD<Object> v1) throws Exception {
    Queue<JavaRDD<Object>> queue = new LinkedList<>();
    queue.add(v1);
    JavaDStream<Object> dStream = context.queueStream(queue);
    JavaDStream<Object> newDStream = dStream.map(AbstractProcessor.this);
    final JavaRDD<Object> rdd = context.sparkContext().emptyRDD();

    newDStream.foreachRDD(new SaxFunction<JavaRDD<Object>, Void>() {

        private static final long serialVersionUID = 672054140484217234L;

        @Override
        public Void execute(JavaRDD<Object> object) throws Exception {
            rdd.union(object);
            return null;
        }
    });
    return rdd;
}
Exception:
Caused by: java.lang.IllegalStateException: Adding new inputs, transformations, and output operations after starting a context is not supported
at org.apache.spark.streaming.dstream.DStream.validateAtInit(DStream.scala:220)
at org.apache.spark.streaming.dstream.DStream.<init>(DStream.scala:64)
at org.apache.spark.streaming.dstream.InputDStream.<init>(InputDStream.scala:42)
at org.apache.spark.streaming.dstream.QueueInputDStream.<init>(QueueInputDStream.scala:29)
at org.apache.spark.streaming.StreamingContext.queueStream(StreamingContext.scala:513)
at org.apache.spark.streaming.StreamingContext.queueStream(StreamingContext.scala:492)
at org.apache.spark.streaming.api.java.JavaStreamingContext.queueStream(JavaStreamingContext.scala:436)
Is there any way I can create a DStream and run operations on it at runtime, or update the DAG after the context has started?
Thanks in advance.
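For reference, a minimal sketch of the usual Spark Streaming pattern, assuming the per-batch logic can be expressed with transform(): every input, transformation, and output operation is declared before start(), and transform() applies an arbitrary RDD-to-RDD function to each batch at runtime. The class and pipeline below are hypothetical, not the asker's code.

import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class TransformBeforeStart {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("TransformBeforeStart").setMaster("local[*]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

        // Any input DStream works here; a queue stream keeps the sketch self-contained.
        Queue<JavaRDD<Object>> queue = new LinkedList<>();
        queue.add(jssc.sparkContext().parallelize(java.util.Arrays.<Object>asList(1, 2, 3)));
        JavaDStream<Object> input = jssc.queueStream(queue);

        // transform() runs arbitrary RDD-to-RDD logic for every batch at runtime,
        // but the transformation itself must be declared before start().
        JavaDStream<Object> processed = input.transform(batch -> batch.map(x -> x));

        // Output operation, also declared before start().
        processed.foreachRDD(rdd -> System.out.println("batch size = " + rdd.count()));

        jssc.start();            // from this point on, the DAG is fixed
        jssc.awaitTermination();
    }
}

Anything created after start(), such as the queueStream inside the call() method above, trips the validateAtInit check shown in the stack trace.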

Related

Abort the driver immediately when any of the executors fails

While loading data into the database, a particular column may contain a bad record.
If one executor fails, the driver needs to be notified with the message and the job has to be terminated.
I thought of doing this using an accumulator. Please give me a suggestion on how to do this.
My code is attached below...
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("loadSqlData").master("local[*]").getOrCreate();

    Properties connectionProperties = new Properties();
    connectionProperties.put("user", "postgres");
    connectionProperties.put("password", "root");

    Dataset<Row> personcsvdata = spark.read().option("header", "true").csv("C:\\Users\\Manasa\\Documents\\nulldata.csv");
    personcsvdata.show();

    LongAccumulator countErrors = spark.sparkContext().longAccumulator();

    try {
        personcsvdata.write().mode(SaveMode.Append).jdbc("jdbc:postgresql://localhost:5432/postgres", "public.employee", connectionProperties);
        countErrors.add(1);
    }
    catch (Exception e) {
    }
}
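A minimal sketch of the accumulator idea mentioned above, under assumptions: the "id" column and the null check are placeholders, and accumulator updates made inside transformations are not exactly-once if tasks are retried. Because the JDBC write is an action driven from the driver, a task failure that exhausts its retries surfaces there as an exception and can be caught to abort the job.

import java.util.Properties;

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

public class LoadWithAbort {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("loadSqlData").master("local[*]").getOrCreate();
        LongAccumulator badRecords = spark.sparkContext().longAccumulator("badRecords");

        Dataset<Row> csv = spark.read().option("header", "true").csv("C:\\Users\\Manasa\\Documents\\nulldata.csv");

        // Validation pass on the executors: count rows whose "id" column is null
        // ("id" is a hypothetical column name).
        Dataset<Row> validated = csv.filter((FilterFunction<Row>) row -> {
            if (row.isNullAt(row.fieldIndex("id"))) {
                badRecords.add(1);   // executors only add; the driver reads the total
                return false;        // drop the bad row
            }
            return true;
        });

        Properties props = new Properties();
        props.put("user", "postgres");
        props.put("password", "root");

        try {
            validated.write().mode(SaveMode.Append)
                     .jdbc("jdbc:postgresql://localhost:5432/postgres", "public.employee", props);
        } catch (Exception e) {
            // The jdbc write is an action driven from the driver, so a task failure that
            // exhausts its retries surfaces here and the job can be aborted explicitly.
            spark.stop();
            throw new RuntimeException("Aborting job: load failed", e);
        }

        if (badRecords.value() > 0) {
            System.err.println("Bad records skipped: " + badRecords.value());
        }
        spark.stop();
    }
}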

Scope 'job' is not active for the current thread, No context holder available for job scope Spring-Batch

In my Spring Batch job, I'm trying to share data between steps using the JobExecutionContext, which works only if I keep the steps single-threaded, as follows:
@EnableTask
@EnableBatchProcessing
@Configuration
@PropertySource(value = {"classpath:application.properties"})
public class Config {

    private static final HashMap<String, Object> OVERRIDDEN_BY_EXPRESSION = null;
    private static final String QUERY = "SELECT * FROM \"Config\"";

    @Autowired
    public JobBuilderFactory jobBuilderFactory;

    @Autowired
    public StepBuilderFactory stepBuilderFactory;

    @Autowired
    private MongoTemplate mongoTemplate;

    @Autowired
    EntityManager em;

    @Autowired
    DataSource dataSource;

    /* Config step */
    @Bean
    public JdbcCursorItemReader<BatchConfig> configReader(DataSource dataSource) {
        JdbcCursorItemReader<BatchConfig> config = new JdbcCursorItemReader<>();
        config.setDataSource(dataSource);
        config.setSql(QUERY);
        config.setRowMapper(new BatchRowMapper());
        return config;
    }

    @Bean
    public ItemWriter<BatchConfig> itemWriter() {
        return new ItemWriter<BatchConfig>() {

            private StepExecution stepExecution;

            @Override
            public void write(List<? extends BatchConfig> items) {
                ExecutionContext stepContext = this.stepExecution.getExecutionContext();
                for (BatchConfig item : items) {
                    HashMap<String, Object> table = new HashMap<>();
                    table.put("date", item.getDate_time());
                    table.put("size", item.getSize());
                    System.out.println(table);
                    stepContext.put(item.getName(), table);
                }
            }

            @BeforeStep
            public void saveStepExecution(StepExecution stepExecution) {
                this.stepExecution = stepExecution;
            }
        };
    }

    @Bean
    public Step stepConfig(JdbcCursorItemReader<BatchConfig> configReader) throws Exception {
        return stepBuilderFactory.get("stepConfig")
                .<BatchConfig, BatchConfig>chunk(10)
                .reader(configReader)
                .writer(itemWriter())
                .listener(promotionListener())
                .build();
    }

    @Bean
    public ExecutionContextPromotionListener promotionListener() {
        ExecutionContextPromotionListener listener = new ExecutionContextPromotionListener();
        listener.setKeys(new String[] {"COUNTRY", "CATEGORY", "USER"});
        return listener;
    }

    /* Country step */
    @JobScope
    @Bean
    public MongoItemReader<COUNTRY> CountryItemReader(@Value("#{jobExecutionContext['COUNTRY']}") HashMap<String, Object> table) {
        int date = (int) table.get("date");
        MongoItemReader<COUNTRY> reader = new MongoItemReader<COUNTRY>();
        reader.setTemplate(mongoTemplate);
        reader.setTargetType(COUNTRY.class);
        reader.setCollection("COUNTRY");
        reader.setFields("{\"COUNTRY_NAME\": 1,\"SHORT_NAME\": 1,\"DEPT_CODE\": 1}");
        reader.setSort(new HashMap<String, Sort.Direction>() {{
            put("_id", Sort.Direction.DESC);
        }});
        reader.setQuery("{DATE_TIME: {$gt:" + date + "}}");
        reader.setPageSize(250);
        return reader;
    }

    @Bean
    public CountryItemProcessor CountryProcessor() {
        return new CountryItemProcessor();
    }

    @Bean
    public JpaItemWriter<COUNTRY> country_writer() {
        JpaItemWriter<COUNTRY> jpa = new JpaItemWriter<COUNTRY>();
        jpa.setEntityManagerFactory(em.getEntityManagerFactory());
        return jpa;
    }

    @JobScope
    @Bean
    public Step step1(@Value("#{jobExecutionContext['COUNTRY']}") HashMap<String, Object> tab) {
        int size = (int) tab.get("size");
        //System.out.println(size);
        return stepBuilderFactory.get("step1")
                .<COUNTRY, COUNTRY>chunk(20)
                .reader(CountryItemReader(OVERRIDDEN_BY_EXPRESSION))
                .writer(country_writer())
                .build();
    }

    @Bean
    public Job TestJob(Step stepConfig) throws Exception {
        return this.jobBuilderFactory.get("TestJob")
                .incrementer(new RunIdIncrementer()) // because of a Spring config bug, this incrementer is not really useful
                .start(stepConfig)
                .next(step1(OVERRIDDEN_BY_EXPRESSION))
                .build();
    }
}
However, when adding a SimpleAsyncTaskExecutor, an error occurred:
org.springframework.beans.factory.support.ScopeNotActiveException: Error creating bean with name 'scopedTarget.CountryItemReader': Scope 'job' is not active for the current thread; consider defining a scoped proxy for this bean if you intend to refer to it from a singleton; nested exception is java.lang.IllegalStateException: No context holder available for job scope
at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:383) ~[spring-beans-5.3.6.jar:5.3.6]
at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:208) ~[spring-beans-5.3.6.jar:5.3.6]
at org.springframework.aop.target.SimpleBeanTargetSource.getTarget(SimpleBeanTargetSource.java:35) ~[spring-aop-5.3.6.jar:5.3.6]
at org.springframework.aop.framework.CglibAopProxy$DynamicAdvisedInterceptor.intercept(CglibAopProxy.java:676) ~[spring-aop-5.3.6.jar:5.3.6]
at org.springframework.batch.item.data.MongoItemReader$$EnhancerBySpringCGLIB$$67443e4.read(<generated>) ~[spring-batch-infrastructure-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.item.SimpleChunkProvider.doRead(SimpleChunkProvider.java:99) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.item.SimpleChunkProvider.read(SimpleChunkProvider.java:180) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.item.SimpleChunkProvider$1.doInIteration(SimpleChunkProvider.java:126) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.repeat.support.RepeatTemplate.getNextResult(RepeatTemplate.java:375) ~[spring-batch-infrastructure-4.3.2.jar:4.3.2]
at org.springframework.batch.repeat.support.RepeatTemplate.executeInternal(RepeatTemplate.java:215) ~[spring-batch-infrastructure-4.3.2.jar:4.3.2]
at org.springframework.batch.repeat.support.RepeatTemplate.iterate(RepeatTemplate.java:145) ~[spring-batch-infrastructure-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.item.SimpleChunkProvider.provide(SimpleChunkProvider.java:118) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.item.ChunkOrientedTasklet.execute(ChunkOrientedTasklet.java:71) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.tasklet.TaskletStep$ChunkTransactionCallback.doInTransaction(TaskletStep.java:407) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.step.tasklet.TaskletStep$ChunkTransactionCallback.doInTransaction(TaskletStep.java:331) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.transaction.support.TransactionTemplate.execute(TransactionTemplate.java:140) ~[spring-tx-5.3.6.jar:5.3.6]
at org.springframework.batch.core.step.tasklet.TaskletStep$2.doInChunkContext(TaskletStep.java:273) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.scope.context.StepContextRepeatCallback.doInIteration(StepContextRepeatCallback.java:82) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.repeat.support.TaskExecutorRepeatTemplate$ExecutingRunnable.run(TaskExecutorRepeatTemplate.java:262) ~[spring-batch-infrastructure-4.3.2.jar:4.3.2]
at java.base/java.lang.Thread.run(Thread.java:829) ~[na:na]
Caused by: java.lang.IllegalStateException: No context holder available for job scope
at org.springframework.batch.core.scope.JobScope.getContext(JobScope.java:159) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.batch.core.scope.JobScope.get(JobScope.java:92) ~[spring-batch-core-4.3.2.jar:4.3.2]
at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:371) ~[spring-beans-5.3.6.jar:5.3.6]
I tried solving this issue as in https://github.com/spring-projects/spring-batch/issues/1335, but it seems to use just one thread in addition to the main one.
Is there any way to resolve this issue without adding tweaked code?
I'm planning to scale the job using remote partitioning on Kubernetes; would this issue persist because of the job scope?
Any thoughts or advice are more than welcome.
I'm trying to share data between steps using JobExecutionContext, which works only if I keep the steps single-threaded
Relying on the execution context to share data between multi-threaded steps is incorrect, because the keys will be overridden by concurrent threads. The reference documentation explicitly mentions turning off state management in a multi-threaded environment:
Javadoc: remember to use saveState=false if used in a multi-threaded client
Reference doc: it is not recommended to use job-scoped beans in multi-threaded or partitioned steps
That said, I don't see what key could be shared from a multi-threaded step to the next step (as the threads are executed in parallel), but if you really need to do that, you should use another approach, such as defining a shared bean that is thread-safe.
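A minimal sketch of that shared-bean approach, with hypothetical names (SharedTable, SharedTableConfig) and reusing BatchConfig from the question: the first step's writer stores the values in a ConcurrentHashMap and later steps read them through plain dependency injection, so no job-scoped proxy or execution-context promotion is involved.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.springframework.batch.item.ItemWriter;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

// Hypothetical thread-safe holder shared between steps via plain dependency injection.
class SharedTable {
    private final Map<String, Map<String, Object>> tables = new ConcurrentHashMap<>();

    public void put(String name, Map<String, Object> table) {
        tables.put(name, table);
    }

    public Map<String, Object> get(String name) {
        return tables.get(name);
    }
}

@Configuration
class SharedTableConfig {

    @Bean
    public SharedTable sharedTable() {
        return new SharedTable();   // singleton, safe to inject into any step component
    }

    // The first step's writer stores the data instead of putting it
    // into the step/job execution context.
    @Bean
    public ItemWriter<BatchConfig> configWriter(SharedTable sharedTable) {
        return items -> {
            for (BatchConfig item : items) {
                Map<String, Object> table = new ConcurrentHashMap<>();
                table.put("date", item.getDate_time());
                table.put("size", item.getSize());
                sharedTable.put(item.getName(), table);
            }
        };
    }

    // Later steps read from the same bean, e.g. sharedTable.get("COUNTRY").get("date"),
    // without needing @JobScope or a jobExecutionContext SpEL expression.
}

Note that an in-memory bean is only visible within one JVM, so it would not carry data across workers in a remote-partitioned setup.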

Apache Spark -- Data Grouping and Execution in worker nodes

We are getting live machine data as JSON from RabbitMQ. Below is a sample of the JSON:
{"DeviceId":"MAC-1001","DeviceType":"Sim-1","TimeStamp":"05-12-2017 10:25:35","data":{"Rate":10,"speed":2493,"Mode":1,"EMode":2,"Run":1}}
{"DeviceId":"MAC-1001","DeviceType":"Sim-1","TimeStamp":"05-12-2017 10:25:36","data":{"Rate":10,"speed":2493,"Mode":1,"EMode":2,"Run":1}}
{"DeviceId":"MAC-1002","DeviceType":"Sim-1","TimeStamp":"05-12-2017 10:25:37","data":{"Rate":10,"speed":2493,"Mode":1,"EMode":2,"Run":1}}
{"DeviceId":"MAC-1002","DeviceType":"Sim-1","TimeStamp":"05-12-2017 10:25:38","data":{"Rate":10,"speed":2493,"Mode":1,"EMode":2,"Run":1}}
The data is windowed for a duration of 'X' minutes, and then below is what we want to achieve:
Group the data by deviceId. This is done, but we are not sure whether we can get a Dataset.
We want to loop through the above grouped data and execute the aggregation logic for each device using foreachPartition, so that the code is executed on the worker nodes.
Please correct me if my thought process is wrong here.
Our earlier code was collecting the data, looping through the RDDs, converting them to a Dataset and applying the aggregation logic on the Dataset using the Spark SQLContext APIs.
When doing load testing we saw that 90% of the processing was happening on the master node; after a while the CPU usage spiked to 100% and the process bombed out.
So we are now trying to re-engineer the whole process to execute as much of the logic as possible on the worker nodes.
Below is the code so far that actually works on the worker nodes, but we are yet to get a Dataset for the aggregation logic:
public static void main(String[] args) {
    try {
        mconf = new SparkConf();
        mconf.setAppName("OnPrem");
        mconf.setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(mconf);
        jssc = new JavaStreamingContext(sc, Durations.seconds(60));
        SparkSession spksess = SparkSession.builder().appName("Onprem").getOrCreate();
        //spksess.sparkContext().setLogLevel("ERROR");

        Map<String, String> rabbitMqConParams = new HashMap<String, String>();
        rabbitMqConParams.put("hosts", "localhost");
        rabbitMqConParams.put("userName", "guest");
        rabbitMqConParams.put("password", "guest");
        rabbitMqConParams.put("vHost", "/");
        rabbitMqConParams.put("durable", "true");

        List<JavaRabbitMQDistributedKey> distributedKeys = new LinkedList<JavaRabbitMQDistributedKey>();
        distributedKeys.add(new JavaRabbitMQDistributedKey(QUEUE_NAME, new ExchangeAndRouting(EXCHANGE_NAME, "fanout", ""), rabbitMqConParams));

        Function<Delivery, String> messageHandler = new Function<Delivery, String>() {
            public String call(Delivery message) {
                return new String(message.getBody());
            }
        };

        JavaInputDStream<String> messages = RabbitMQUtils.createJavaDistributedStream(jssc, String.class, distributedKeys, rabbitMqConParams, messageHandler);

        JavaDStream<String> machineDataRDD = messages.window(Durations.minutes(2), Durations.seconds(60)); // every 60 seconds one RDD is created
        machineDataRDD.print();

        JavaPairDStream<String, String> pairedData = machineDataRDD.mapToPair(s -> new Tuple2<String, String>(getMap(s).get("DeviceId").toString(), s));
        JavaPairDStream<String, Iterable<String>> groupedData = pairedData.groupByKey();

        groupedData.foreachRDD(new VoidFunction<JavaPairRDD<String, Iterable<String>>>() {
            @Override
            public void call(JavaPairRDD<String, Iterable<String>> data) throws Exception {
                data.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Iterable<String>>>>() {
                    @Override
                    public void call(Iterator<Tuple2<String, Iterable<String>>> data) throws Exception {
                        while (data.hasNext()) {
                            LOGGER.error("Machine Data == >>" + data.next());
                        }
                    }
                });
            }
        });

        jssc.start();
        jssc.awaitTermination();
    }
    catch (Exception e) {
        e.printStackTrace();
    }
}
The grouping code below gives us an Iterable of Strings per device; ideally we would like to get a Dataset:
JavaPairDStream<String, String> pairedData = machineDataRDD.mapToPair(s -> new Tuple2<String, String>(getMap(s).get("DeviceId").toString(), s));
JavaPairDStream<String, Iterable<String>> groupedData = pairedData.groupByKey();
The important thing for me is the looping using foreachPartition, so that the code execution gets pushed down to the worker nodes.
After looking through more code samples and guidelines: SQLContext and SparkSession are not serializable and are not available on the worker nodes, so we will change our strategy and not try to build a Dataset within the foreachPartition loop.
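To illustrate that change of strategy, here is a minimal sketch of doing the per-device aggregation entirely inside foreachPartition with plain Java on the executors (no SparkSession or SQLContext there); the average-speed aggregation and the use of Jackson for parsing are assumptions, not part of the original code.

// Per-device aggregation computed on the worker nodes, inside foreachPartition.
groupedData.foreachRDD(rdd ->
    rdd.foreachPartition(partition -> {
        com.fasterxml.jackson.databind.ObjectMapper mapper = new com.fasterxml.jackson.databind.ObjectMapper();
        while (partition.hasNext()) {
            Tuple2<String, Iterable<String>> device = partition.next();
            long count = 0;
            double speedSum = 0.0;
            for (String json : device._2()) {
                com.fasterxml.jackson.databind.JsonNode node = mapper.readTree(json);
                speedSum += node.path("data").path("speed").asDouble();
                count++;
            }
            if (count > 0) {
                // Hypothetical per-device result, e.g. the average speed over the window.
                System.out.println(device._1() + " -> avg speed = " + (speedSum / count));
            }
        }
    })
);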

Increment column value in Spark

I have a Spark Streaming job that fetches data from RabbitMQ and saves it into HBase. The save is an Increment operation. I'm using saveAsNewAPIHadoopDataset, but I keep getting the exception below.
Code:
pairDStream.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
    @Override
    public void call(JavaPairRDD<String, Integer> arg0) throws Exception {
        Configuration dbConf = HBaseConfiguration.create();
        dbConf.set("hbase.table.namespace.mappings", "tablename:/mapr/tablename");

        Job jobConf = Job.getInstance(dbConf);
        jobConf.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "tablename");
        jobConf.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

        JavaPairRDD<ImmutableBytesWritable, Increment> hbasePuts = arg0.mapToPair(
            new PairFunction<Tuple2<String, Integer>, ImmutableBytesWritable, Increment>() {
                @Override
                public Tuple2<ImmutableBytesWritable, Increment> call(Tuple2<String, Integer> arg0) throws Exception {
                    String[] keys = arg0._1.split("_");
                    Increment inc = new Increment(Bytes.toBytes(keys[0]));
                    inc.addColumn(Bytes.toBytes("data"),
                                  Bytes.toBytes(keys[1]),
                                  arg0._2);
                    return new Tuple2<ImmutableBytesWritable, Increment>(new ImmutableBytesWritable(), inc);
                }
            });

        // save to HBase - Spark built-in API method
        hbasePuts.saveAsNewAPIHadoopDataset(jobConf.getConfiguration());
    }
});
Exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 6.0 failed 4 times, most recent failure: Lost task 1.3 in stage 6.0 (TID 100, dev-arc-app036.vega.cloud.ironport.com): java.io.IOException: Pass a Delete or a Put
at org.apache.hadoop.hbase.mapreduce.TableOutputFormat$TableRecordWriter.write(TableOutputFormat.java:128)
at org.apache.hadoop.hbase.mapreduce.TableOutputFormat$TableRecordWriter.write(TableOutputFormat.java:87)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1113)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1250)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1119)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1091)
Is it possible to use the "saveAsNewAPIHadoopDataset" method with an Increment rather than a Put?
Any help is greatly appreciated.
Thanks
Akhila.
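TableOutputFormat's record writer only accepts Put and Delete (that is what the "Pass a Delete or a Put" message is saying), so saveAsNewAPIHadoopDataset cannot write Increments. A minimal sketch of one workaround, assuming the HBase client API is available on the executors and reusing the table and column names from the question: open a connection per partition and call Table.increment directly.

pairDStream.foreachRDD(rdd ->
    rdd.foreachPartition(partition -> {
        // One connection per partition, created on the executor rather than the driver.
        org.apache.hadoop.conf.Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.table.namespace.mappings", "tablename:/mapr/tablename");
        try (org.apache.hadoop.hbase.client.Connection connection =
                 org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(conf);
             org.apache.hadoop.hbase.client.Table table =
                 connection.getTable(org.apache.hadoop.hbase.TableName.valueOf("tablename"))) {
            while (partition.hasNext()) {
                Tuple2<String, Integer> record = partition.next();
                String[] keys = record._1().split("_");
                Increment inc = new Increment(Bytes.toBytes(keys[0]));
                inc.addColumn(Bytes.toBytes("data"), Bytes.toBytes(keys[1]), record._2());
                table.increment(inc);   // increments the counter instead of overwriting it
            }
        }
    })
);

Note that increments are not idempotent: if Spark replays a batch, the counters will be bumped twice, so exactly-once semantics would need extra bookkeeping.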

Large data processing using Spring Batch Multi-threaded Step and RepositoryItemWriter/ RepositoryItemReader

I am trying to write a batch processing application using Spring Batch with a multi-threaded step. It is a simple application that reads data from one table and writes it to another, but the data is large, around 2 million records.
I am using RepositoryItemReader and RepositoryItemWriter for reading and writing the data. But after processing some data it fails with "Unable to acquire JDBC Connection".
//Config.java
@Bean
public TaskExecutor taskExecutor() {
    SimpleAsyncTaskExecutor taskExecutor = new SimpleAsyncTaskExecutor();
    taskExecutor.setConcurrencyLimit(10);
    return taskExecutor;
}

@Bean(name = "personJob")
public Job personKeeperJob() {
    Step step = stepBuilderFactory.get("step-1")
            .<User, Person>chunk(1000)
            .reader(userReader)
            .processor(jpaProcessor)
            .writer(personWriter)
            .taskExecutor(taskExecutor())
            .throttleLimit(10)
            .build();

    Job job = jobBuilderFactory.get("person-job")
            .incrementer(new RunIdIncrementer())
            .listener(this)
            .start(step)
            .build();
    return job;
}

//Processor.java
@Override
public Person process(User user) throws Exception {
    Optional<User> userFromDb = userRepo.findById(user.getUserId());
    Person person = new Person();
    if (userFromDb.isPresent()) {
        person.setName(userFromDb.get().getName());
        person.setUserId(userFromDb.get().getUserId());
        person.setDept(userFromDb.get().getDept());
    }
    return person;
}

//Reader.java
@Autowired
public UserItemReader(final UserRepository repository) {
    super();
    this.repository = repository;
}

@PostConstruct
protected void init() {
    final Map<String, Sort.Direction> sorts = new HashMap<>();
    sorts.put("userId", Direction.ASC);
    this.setRepository(this.repository);
    this.setSort(sorts);
    this.setMethodName("findAll");
}

//Writer.java
@PostConstruct
protected void init() {
    this.setRepository(repository);
}

@Transactional
public void write(List<? extends Person> persons) throws Exception {
    repository.saveAll(persons);
}
application.properties
# Datasource
spring.datasource.platform=h2
spring.datasource.url=jdbc:h2:mem:batchdb
spring.main.allow-bean-definition-overriding=true
spring.datasource.hikari.maximum-pool-size=500
Error:
org.springframework.transaction.CannotCreateTransactionException: Could not open JPA EntityManager for transaction; nested exception is org.hibernate.exception.JDBCConnectionException: Unable to acquire JDBC Connection
at org.springframework.orm.jpa.JpaTransactionManager.doBegin(JpaTransactionManager.java:447)
......................
Caused by: org.hibernate.exception.JDBCConnectionException: Unable to acquire JDBC Connection
at org.hibernate.exception.internal.SQLExceptionTypeDelegate.convert(SQLExceptionTypeDelegate.java:48)
............................
Caused by: java.sql.SQLTransientConnectionException: HikariPool-1 - Connection is not available, request timed out after 30927ms.
You are running out of connections.
Try setting the Hikari connection pool to a bigger number:
spring.datasource.hikari.maximum-pool-size=20
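As a hypothetical complement to the answer above (not part of it): SimpleAsyncTaskExecutor creates a new thread per task, so replacing it with a bounded ThreadPoolTaskExecutor and keeping the pool at or below the JDBC pool size also prevents the step from exhausting connections. A minimal sketch:

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.task.TaskExecutor;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;

@Configuration
public class TaskExecutorConfig {

    // Sketch: a bounded thread pool instead of SimpleAsyncTaskExecutor.
    // Keeping the pool size <= spring.datasource.hikari.maximum-pool-size means each
    // concurrent chunk transaction can always obtain a JDBC connection.
    @Bean
    public TaskExecutor taskExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(10);
        executor.setMaxPoolSize(10);
        executor.setQueueCapacity(100);
        executor.setThreadNamePrefix("batch-step-");
        executor.initialize();
        return executor;
    }
}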
