Spark GraphX out of memory error in SparkListenerBus (java.lang.OutOfMemoryError: Java heap space) - apache-spark

I have an out-of-memory problem with Apache Spark (GraphX). The application runs, but shuts down after some time. I use Spark 1.2.0. The cluster has enough memory and a sufficient number of cores. Other applications that do not use GraphX run without problems. The application uses Pregel.
I submit the application in Hadoop YARN cluster mode:
HADOOP_CONF_DIR=/etc/hadoop/conf spark-submit --class DPFile --deploy-mode cluster --master yarn --num-executors 4 --driver-memory 10g --executor-memory 6g --executor-cores 8 --files log4j.properties spark_routing_2.10-1.0.jar road_cr_big2 1000
Spark configuration:
val conf = new SparkConf(true)
.set("spark.eventLog.overwrite", "true")
.set("spark.driver.extraJavaOptions", "-Dlog4j.configuration=log4j.properties")
.set("spark.yarn.applicationMaster.waitTries", "60")
.set("yarn.log-aggregation-enable","true")
.set("spark.akka.frameSize", "500")
.set("spark.akka.askTimeout", "600")
.set("spark.core.connection.ack.wait.timeout", "600")
.set("spark.akka.timeout","1000")
.set("spark.akka.heartbeat.pauses","60000")
.set("spark.akka.failure-detector.threshold","3000.0")
.set("spark.akka.heartbeat.interval","10000")
.set("spark.ui.retainedStages","100")
.set("spark.ui.retainedJobs","100")
.set("spark.driver.maxResultSize","4G")
Thank you for your answers.
Log:
ERROR Utils: Uncaught exception in thread SparkListenerBus
java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2367)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
at java.lang.StringBuilder.append(StringBuilder.java:132)
at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:197)
at org.apache.spark.util.FileLogger.logLine(FileLogger.scala:192)
at org.apache.spark.scheduler.EventLoggingListener.logEvent(EventLoggingListener.scala:88)
at org.apache.spark.scheduler.EventLoggingListener.onJobStart(EventLoggingListener.scala:113)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$postToAll$3.apply(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$postToAll$3.apply(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$foreachListener$1.apply(SparkListenerBus.scala:83)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$foreachListener$1.apply(SparkListenerBus.scala:81)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.SparkListenerBus$class.foreachListener(SparkListenerBus.scala:81)
at org.apache.spark.scheduler.SparkListenerBus$class.postToAll(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.LiveListenerBus.postToAll(LiveListenerBus.scala:32)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:56)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:56)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(LiveListenerBus.scala:56)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply(LiveListenerBus.scala:47)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply(LiveListenerBus.scala:47)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1468)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1.run(LiveListenerBus.scala:46)
Exception in thread "SparkListenerBus" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2367)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
at java.lang.StringBuilder.append(StringBuilder.java:132)
at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:197)
at org.apache.spark.util.FileLogger.logLine(FileLogger.scala:192)
at org.apache.spark.scheduler.EventLoggingListener.logEvent(EventLoggingListener.scala:88)
at org.apache.spark.scheduler.EventLoggingListener.onJobStart(EventLoggingListener.scala:113)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$postToAll$3.apply(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$postToAll$3.apply(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$foreachListener$1.apply(SparkListenerBus.scala:83)
at org.apache.spark.scheduler.SparkListenerBus$$anonfun$foreachListener$1.apply(SparkListenerBus.scala:81)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.SparkListenerBus$class.foreachListener(SparkListenerBus.scala:81)
at org.apache.spark.scheduler.SparkListenerBus$class.postToAll(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.LiveListenerBus.postToAll(LiveListenerBus.scala:32)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:56)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:56)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(LiveListenerBus.scala:56)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply(LiveListenerBus.scala:47)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply(LiveListenerBus.scala:47)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1468)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1.run(LiveListenerBus.scala:46)
ERROR LiveListenerBus: SparkListenerBus thread is dead! This means SparkListenerEvents have not been (and will no longer be) propagated to listeners for some time.
ERROR ApplicationMaster: RECEIVED SIGNAL 15: SIGTERM
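A related angle, sketched below: GraphX's Pregel loop extends the RDD lineage on every superstep, and those ever-longer lineages inflate both driver-side bookkeeping and the size of the events the listener bus has to serialize. Periodically checkpointing the graph's RDDs truncates the lineage (sc and graph are assumed names from the application; the checkpoint directory is a placeholder):
sc.setCheckpointDir("hdfs:///tmp/spark-checkpoints")  // placeholder path
graph.vertices.checkpoint()  // VertexRDD and EdgeRDD are plain RDDs,
graph.edges.checkpoint()     // so RDD.checkpoint() applies to both
graph.vertices.count()       // actions force the checkpoints to materialize
graph.edges.count()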

Related

Spark app failing with error org.apache.spark.shuffle.FetchFailedException

I am running Spark 2.4.0 on EMR. I am trying to process huge data (1 TB) on EMR using 100 nodes with 122 GB of memory and 16 cores each. I am getting the below exceptions after some time. Here are the parameters I've set.
--executor-memory 80g
--executor-cores 4
--driver-memory 80g
--driver-cores 1
from pyspark.sql import SparkSession

spark = (SparkSession
.builder
.master("yarn")
.config("spark.shuffle.service.enabled","true")
.config("spark.dynamicAllocation.shuffleTracking.enabled","true")
.config("spark.dynamicAllocation.enabled", "true")
.config("spark.dynamicAllocation.minExecutors","50")
#.config("spark.dynamicAllocation.maxExecutors", "500")
.config("spark.dynamicAllocation.executorIdleTimeout","2m")
.config("spark.driver.maxResultSize", "16g")
.config("spark.kryoserializer.buffer.max", "2047")
.config("spark.rpc.message.maxSize", "2047")
.config("spark.memory.offHeap.enabled","true")
.config("spark.memory.offHeap.size","50g")
.config("spark.sql.autoBroadcastJoinThreshold", "-1")
.config("spark.sql.broadcastTimeout","1200")
.config("spark.sql.shuffle.partitions","200")
.config("spark.memory.storageFraction","0.3")
.config("spark.yarn.executor.memoryOverhead","2g")
.enableHiveSupport()
.getOrCreate())
Here are the three types of executor failures I've been getting; they eventually cause the corresponding stage to rerun. Sometimes the rerun succeeds and sometimes they keep retrying forever.
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 8
org.apache.spark.shuffle.FetchFailedException: Failure while fetching StreamChunkId{streamId=45963765394, chunkIndex=0}: java.lang.RuntimeException: Executor is not registered (appId=application_1625085506598_0885, execId=137)
org.apache.spark.shuffle.FetchFailedException: Failed to connect to ip-10-40-6-235.ap-south-1.compute.internal/10.40.6.235:7337
I am attaching a screenshot of the Spark DAG.
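One thing that stands out in the configuration above, sketched below rather than offered as a verified fix: as far as I can tell, Spark 2.4 on YARN sizes containers as executor memory plus memoryOverhead only, so the 50g of spark.memory.offHeap.size has to fit inside the 2g overhead. When it does not, YARN kills executors mid-shuffle, which surfaces on their peers as FetchFailedException / "Executor is not registered". The overhead value below is illustrative:
import org.apache.spark.sql.SparkSession

// Sketch: make the YARN overhead large enough to cover the off-heap store.
val spark = SparkSession.builder()
  .config("spark.memory.offHeap.enabled", "true")
  .config("spark.memory.offHeap.size", "50g")
  .config("spark.executor.memoryOverhead", "52g") // >= off-heap + JVM extras
  .getOrCreate()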

Spark YARN: Cannot allocate a page with more than 17179869176 bytes

I am joining 11 million records. I am running with 5 workers on an EMR cluster with Spark 2.2.1.
I am getting the following error while running the job:
executor 3): java.lang.IllegalArgumentException: Cannot allocate a page with more than 17179869176 bytes
at org.apache.spark.memory.TaskMemoryManager.allocatePage(TaskMemoryManager.java:277)
at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:90)
at org.apache.spark.shuffle.sort.ShuffleExternalSorter.growPointerArrayIfNecessary(ShuffleExternalSorter.java:328)
at org.apache.spark.shuffle.sort.ShuffleExternalSorter.insertRecord(ShuffleExternalSorter.java:379)
at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.insertRecordIntoSorter(UnsafeShuffleWriter.java:246)
at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:167)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I am not able to understand the possible reason for this. Please help me with which parameters I should set.
Currently I am running with the following arguments: --num-executors 5 --conf spark.eventLog.enabled=true --executor-memory 70g --driver-memory 30g --executor-cores 16 --conf spark.shuffle.memoryFraction=0.5
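For what it's worth, 17179869176 bytes is exactly the page-size cap in TaskMemoryManager ((2^31 - 1) * 8), and a task only asks for a page that big when it has to buffer an enormous amount of data, typically because of too few shuffle partitions or a heavily skewed join key. A sketch of the usual remedy, with left, right, and "key" as placeholder names:
import org.apache.spark.sql.functions.col

// Sketch: spread the rows over more partitions so no single task has to
// sort anywhere near 16 GB; the partition count is illustrative.
val joined = left
  .repartition(2000, col("key"))
  .join(right, "key")
Note also that spark.shuffle.memoryFraction in the submit command above only takes effect with the legacy memory manager (spark.memory.useLegacyMode=true), so it likely does nothing here.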

Spark job fails due to stackoverflow error

My Spark job uses MLlib to train a LogisticRegression model on some data, but it fails due to a StackOverflowError. Here is the error message shown in the spark-shell:
java.lang.StackOverflowError
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.generic.Growable$$anonfun$$plus$plus$eq$1.apply(Growable.scala:48)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:176)
at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:45)
at scala.collection.generic.GenericCompanion.apply(GenericCompanion.scala:48)
...
When I check the Spark UI, there is no failed stage or job! This is how I run my spark-shell:
spark-shell --num-executors 100 --driver-memory 20g --conf spark.driver.maxResultSize=5g --executor-memory 8g --executor-cores 3
I even tried to increase the stack size by adding the following option when running the spark-shell, but it didn't help:
--conf "spark.driver.extraJavaOptions='-XX:ThreadStackSize=81920'"
What is the issue?
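One sketch worth noting: a StackOverflowError deep in Scala collection code during an iterative MLlib job is commonly the driver recursing while serializing a very long RDD lineage, which happens outside normal job/stage accounting and so shows no failed stage in the UI. Checkpointing truncates the lineage (sc and trainingData are assumed names; the directory is a placeholder):
sc.setCheckpointDir("hdfs:///tmp/checkpoints")  // placeholder path
trainingData.checkpoint()
trainingData.count()  // an action is needed for the checkpoint to happen
Also, -XX:ThreadStackSize on the driver does not change executor threads; if the overflow is executor-side, the equivalent option would go in spark.executor.extraJavaOptions.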

Spark heapspace error while running Python program

When I run Python code in Spark using
spark-submit --master local --packages com.databricks:spark-xml_2.10:0.4.1 \
--driver-memory 8G --executor-memory 7G
I get this error
17/02/28 18:59:25 ERROR util.Utils: Uncaught exception in thread stdout writer for /usr/local/bin/python2.7 java.lang.OutOfMemoryError: Java heap space
I get the same error when using
spark.yarn.executor.memoryOverhead=1024M
I have 32 GB of RAM, and the Java heap options are set to 4 GB.
How can I fix this?
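A sketch of one possibility, not a confirmed diagnosis: the failing "stdout writer for /usr/local/bin/python2.7" thread is the JVM side streaming a partition's rows to a Python worker, so a single oversized partition from the XML source can exhaust the heap there. Splitting the data into more partitions keeps each transfer small (df is a placeholder for the loaded DataFrame; sketch shown in Scala):
// Sketch: the partition count is illustrative; tune it to the input size.
val repartitioned = df.repartition(200)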

Spark Streaming - java.io.IOException: Lease timeout of 0 seconds expired

I have a Spark Streaming application that writes checkpoints to HDFS.
Does anyone know the solution?
Previously we were using kinit to specify the principal and keytab. We got the suggestion to specify these via the spark-submit command instead of kinit, but we still get this error, and it brings the Spark Streaming application down.
spark-submit --principal sparkuser@HADOOP.ABC.COM --keytab /home/sparkuser/keytab/sparkuser.keytab --name MyStreamingApp --master yarn-cluster --conf "spark.driver.extraJavaOptions=-XX:+UseConcMarkSweepGC" --conf "spark.eventLog.enabled=true" --conf "spark.streaming.backpressure.enabled=true" --conf "spark.streaming.stopGracefullyOnShutdown=true" --conf "spark.executor.extraJavaOptions=-XX:+UseConcMarkSweepGC" --class com.abc.DataProcessor myapp.jar
I see multiple occurrences of the following exception in the logs, and finally a SIGTERM 15 that kills the executor and driver. We are using CDH 5.5.2.
2016-10-02 23:59:50 ERROR SparkListenerBus LiveListenerBus:96 -
Listener EventLoggingListener threw an exception
java.lang.reflect.InvocationTargetException
at sun.reflect.GeneratedMethodAccessor8.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.scheduler.EventLoggingListener$$anonfun$logEvent$3.apply(EventLoggingListener.scala:148)
at org.apache.spark.scheduler.EventLoggingListener$$anonfun$logEvent$3.apply(EventLoggingListener.scala:148)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.EventLoggingListener.logEvent(EventLoggingListener.scala:148)
at org.apache.spark.scheduler.EventLoggingListener.onUnpersistRDD(EventLoggingListener.scala:184)
at org.apache.spark.scheduler.SparkListenerBus$class.onPostEvent(SparkListenerBus.scala:50)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:56)
at org.apache.spark.util.AsynchronousListenerBus.postToAll(AsynchronousListenerBus.scala:37)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:79)
at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1135)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:63)
Caused by: java.io.IOException: Lease timeout of 0 seconds expired.
at org.apache.hadoop.hdfs.DFSOutputStream.abort(DFSOutputStream.java:2370)
at org.apache.hadoop.hdfs.DFSClient.closeAllFilesBeingWritten(DFSClient.java:964)
at org.apache.hadoop.hdfs.DFSClient.renewLease(DFSClient.java:932)
at org.apache.hadoop.hdfs.LeaseRenewer.renew(LeaseRenewer.java:423)
at org.apache.hadoop.hdfs.LeaseRenewer.run(LeaseRenewer.java:448)
at org.apache.hadoop.hdfs.LeaseRenewer.access$700(LeaseRenewer.java:71)
at org.apache.hadoop.hdfs.LeaseRenewer$1.run(LeaseRenewer.java:304)
at java.lang.Thread.run(Thread.java:745)
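One avenue, sketched under stated assumptions: the exception originates in EventLoggingListener's HDFS output stream, and a long-running streaming application loses the ability to renew HDFS leases once its Kerberos ticket expires. Besides the --principal/--keytab flags above, logging in from the keytab inside the application keeps the ticket refreshed (principal and keytab path are copied from the spark-submit command above):
import org.apache.hadoop.security.UserGroupInformation

// Sketch: re-login from the keytab so the HDFS client can keep renewing
// leases after the original TGT would otherwise have expired.
UserGroupInformation.loginUserFromKeytab(
  "sparkuser@HADOOP.ABC.COM",
  "/home/sparkuser/keytab/sparkuser.keytab")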
