Getting error - py4j.protocol.Py4JJavaError: An error occurred while calling o115.save - apache-spark

I have a PySpark job that runs successfully on its monthly schedule. It failed this month, and we have not made any changes to the code or the configuration. I have looked at all the similar threads, but none seems to help.
Any suggestions are greatly appreciated!
The complete traceback is as follows:
'''
Significant Credit Monthly starts at 2020-12-04 17:55:04.761266
Traceback (most recent call last):
File "/data/code/sig_cr_dc/Sig_CD_Monthly1.py", line 105, in <module>
Sig_Cdt_Stats.write.mode("overwrite").save(Sig_Cdt_Path)
File "/var/opt/teradata/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p2245.2467/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 397, in save
File "/var/opt/teradata/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p2245.2467/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/var/opt/teradata/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p2245.2467/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco
File "/var/opt/teradata/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p2245.2467/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o115.save.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelation.scala:154)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:106)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply(InsertIntoHadoopFsRelation.scala:106)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:53)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation.run(InsertIntoHadoopFsRelation.scala:106)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:70)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:56)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:56)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:256)
at org.apache.spark.sql.DataFrameWriter.dataSource$lzycompute$1(DataFrameWriter.scala:181)
at org.apache.spark.sql.DataFrameWriter.org$apache$spark$sql$DataFrameWriter$$dataSource$1(DataFrameWriter.scala:181)
at org.apache.spark.sql.DataFrameWriter$$anonfun$save$1.apply$mcV$sp(DataFrameWriter.scala:188)
at org.apache.spark.sql.DataFrameWriter.executeAndCallQEListener(DataFrameWriter.scala:154)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:188)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:172)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 164 in stage 9.0 failed 4 times, most recent failure: Lost task 164.3 in stage 9.0 (TID 18018, kmblhdpdata2.kbank.kotakgroup.com, executor 149): ExecutorLostFailure (executor 149 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 1.5 GB of 1.5 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1433)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1421)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1420)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1420)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1644)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1603)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1592)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1862)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1875)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelation.scala:148)
... 31 more
'''
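The decisive line is the last Caused by: YARN killed executor 149's container for exceeding its memory limit ("1.5 GB of 1.5 GB physical memory used"), and the message itself suggests raising spark.yarn.executor.memoryOverhead; if this month's input grew, the old headroom may simply no longer be enough. A minimal sketch of applying that suggestion, assuming the script builds its own SparkContext (the app name and the 2048 MB value are illustrative, not recommendations):

from pyspark import SparkConf, SparkContext

# The YARN container limit in the error (1.5 GB) is the executor heap plus this
# overhead, so raising the overhead gives the Python workers more off-heap room.
conf = (SparkConf()
        .setAppName("Sig_CD_Monthly")  # hypothetical app name
        .set("spark.yarn.executor.memoryOverhead", "2048"))  # in MB
sc = SparkContext(conf=conf)

The same setting can also be passed at submit time via --conf spark.yarn.executor.memoryOverhead=2048; repartitioning the DataFrame before the save, to shrink per-task memory, is another common mitigation.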

Related

Errors while trying to use collect() method in pyspark. (Windows 10) [duplicate]

This question already has answers here:
Python worker failed to connect back
(9 answers)
Closed last year.
So I've read dozens of tutorials on how to set up PySpark.
I've set all environment variables like HADOOP_HOME, SPARK_HOME, etc.
I've downloaded winutils and put it in %SPARK_HOME%/bin.
I've checked that the pyspark version matches the Spark release I downloaded from the official site (3.2.1).
I am using Java JDK 8.
I've tried different versions of Java and Spark/PySpark, but every time I use the collect method on an RDD I get a ton of errors.
This is my sample program:
from pyspark.sql import SparkSession
ss = SparkSession.builder.master('local').appName('name').getOrCreate()
rd = ss.sparkContext.parallelize([1, 2, 3, 4, 5])
rd1 = rd.map(lambda x: x** 2).collect()
print(rd1)
And this is what I am getting (sorry, I have no idea how to put everything under a spoiler).
Any help would be greatly appreciated!
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/05 21:03:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/05 21:04:06 WARN ProcfsMetricsGetter: Exception when trying to compute pagesize, as a result reporting of ProcessTree metrics is stopped
22/02/05 21:04:06 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:188)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:108)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:121)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:162)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:175)
... 14 more
22/02/05 21:04:06 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (WIN-CH4BAQ3PTMC executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:188)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:108)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:121)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:162)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:175)
... 14 more
22/02/05 21:04:06 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Traceback (most recent call last):
File "C:\Users\user\PycharmProjects\pythonProject\spark.py", line 6, in <module>
print(rd.map(lambda x: x ** 2).collect())
File "C:\Users\user\PycharmProjects\pythonProject\venv\lib\site-packages\pyspark\rdd.py", line 950, in collect
sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "C:\Users\user\PycharmProjects\pythonProject\venv\lib\site-packages\py4j\java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "C:\Users\user\PycharmProjects\pythonProject\venv\lib\site-packages\pyspark\sql\utils.py", line 111, in deco
return f(*a, **kw)
File "C:\Users\user\PycharmProjects\pythonProject\venv\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (WIN-CH4BAQ3PTMC executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:188)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:108)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:121)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:162)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:175)
... 14 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
at scala.collection.immutable.List.foreach(List.scala:333)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
at scala.Option.foreach(Option.scala:437)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:188)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:108)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:121)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:162)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:175)
... 14 more
There is nothing wrong with your program.
It looks like the issue is in your Spark setup. In the command prompt, check whether you can get a pyspark prompt without any error.
Also check your Python version and environment variables.
BIG thanks to #blackbishop for the tip!
Python worker failed to connect back
All I did was add:
import findspark
findspark.init()
Everything works now!
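For context, a minimal sketch of where those two lines go relative to the sample program from the question; findspark.init() is meant to run before pyspark is imported, so it can locate SPARK_HOME and put pyspark on sys.path (everything below the imports is unchanged from the question):

import findspark
findspark.init()  # resolves SPARK_HOME and fixes up sys.path before pyspark loads

from pyspark.sql import SparkSession

ss = SparkSession.builder.master('local').appName('name').getOrCreate()
rd = ss.sparkContext.parallelize([1, 2, 3, 4, 5])
rd1 = rd.map(lambda x: x ** 2).collect()
print(rd1)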

Spark local mode in Kubernetes sometimes fail with : Invalid Spark URL

I've been running a very small PySpark (3.2.0) job in Kubernetes. It works locally when I run it:
directly with Docker on my localhost
in kind
in a GKE Kubernetes cluster
But in another GKE Kubernetes cluster it fails with:
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/usr/local/lib/python3.9/site-packages/pyspark/jars/spark-unsafe_2.12-3.2.0.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/17 14:34:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/12/17 14:34:17 ERROR SparkContext: Error initializing SparkContext.
org.apache.spark.SparkException: Invalid Spark URL: spark://HeartbeatReceiver#XXXXX_NAME_OF_POD_XXXXXXX.2855fc:40677
at org.apache.spark.rpc.RpcEndpointAddress$.apply(RpcEndpointAddress.scala:66)
at org.apache.spark.rpc.netty.NettyRpcEnv.asyncSetupEndpointRefByURI(NettyRpcEnv.scala:140)
at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:101)
at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:109)
at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
at org.apache.spark.executor.Executor.<init>(Executor.scala:218)
at org.apache.spark.scheduler.local.LocalEndpoint.<init>(LocalSchedulerBackend.scala:64)
at org.apache.spark.scheduler.local.LocalSchedulerBackend.start(LocalSchedulerBackend.scala:132)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:220)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:581)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/java.lang.reflect.Constructor.newInstance(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Unknown Source)
21/12/17 14:34:17 ERROR Utils: Uncaught exception in thread Thread-3
java.lang.NullPointerException
at org.apache.spark.scheduler.local.LocalSchedulerBackend.org$apache$spark$scheduler$local$LocalSchedulerBackend$$stop(LocalSchedulerBackend.scala:173)
at org.apache.spark.scheduler.local.LocalSchedulerBackend.stop(LocalSchedulerBackend.scala:144)
at org.apache.spark.scheduler.TaskSchedulerImpl.stop(TaskSchedulerImpl.scala:927)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2516)
at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContext.scala:2086)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1442)
at org.apache.spark.SparkContext.stop(SparkContext.scala:2086)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:677)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/java.lang.reflect.Constructor.newInstance(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Unknown Source)
21/12/17 14:34:17 WARN MetricsSystem: Stopping a MetricsSystem that is not running
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/pyspark/sql/session.py", line 228, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "/usr/local/lib/python3.9/site-packages/pyspark/context.py", line 392, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "/usr/local/lib/python3.9/site-packages/pyspark/context.py", line 146, in __init__
self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
File "/usr/local/lib/python3.9/site-packages/pyspark/context.py", line 209, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "/usr/local/lib/python3.9/site-packages/pyspark/context.py", line 329, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "/usr/local/lib/python3.9/site-packages/py4j/java_gateway.py", line 1573, in __call__
return_value = get_return_value(
File "/usr/local/lib/python3.9/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: org.apache.spark.SparkException: Invalid Spark URL: spark://HeartbeatReceiver#XXXXX_NAME_OF_POD_XXXXXXX.2855fc:40677
at org.apache.spark.rpc.RpcEndpointAddress$.apply(RpcEndpointAddress.scala:66)
at org.apache.spark.rpc.netty.NettyRpcEnv.asyncSetupEndpointRefByURI(NettyRpcEnv.scala:140)
at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:101)
at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:109)
at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
at org.apache.spark.executor.Executor.<init>(Executor.scala:218)
at org.apache.spark.scheduler.local.LocalEndpoint.<init>(LocalSchedulerBackend.scala:64)
at org.apache.spark.scheduler.local.LocalSchedulerBackend.start(LocalSchedulerBackend.scala:132)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:220)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:581)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown Source)
at java.base/java.lang.reflect.Constructor.newInstance(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Unknown Source)
The only PySpark settings are:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName(app_name) \
    .config("spark.driver.memory", "1g") \
    .config("spark.master", "local") \
    .getOrCreate()
It runs in a Docker container:
FROM python:3.9-slim-buster AS py3
FROM eclipse-temurin:11-jre-focal
COPY --from=py3 / /
RUN pip install pyspark
...
If I add the env var "SPARK_LOCAL_HOSTNAME": "localhost" to my second Kubernetes pod deployment (the one that is failing), then it works. Do you have any idea why it sometimes works without it?
Thank you
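One plausible explanation, offered as an assumption: the error comes from Spark rejecting spark://HeartbeatReceiver#<driver-hostname>:<port> whenever the driver's hostname does not parse as a valid URI host, so only some generated pod names trigger it, and SPARK_LOCAL_HOSTNAME=localhost sidesteps the hostname entirely. A sketch of the same workaround applied from inside the script instead of the pod spec, assuming the variable takes effect when set before the SparkSession (and therefore the driver JVM) is created:

import os

# Hypothetical in-script equivalent of the pod-level env var; Spark reads it
# when the driver JVM starts, so it must be set before getOrCreate() runs.
os.environ["SPARK_LOCAL_HOSTNAME"] = "localhost"

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("app_name") \
    .config("spark.driver.memory", "1g") \
    .config("spark.master", "local") \
    .getOrCreate()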

Issue loading data set using spark-redis

I am trying to load a data set with spark-redis, but the operation always fails. The Spark dataframe that I am trying to write has 85 million rows, and the write operation fails roughly 25 million rows in. I wonder how to solve this kind of problem.
Here are the operations that I execute in my Python script:
from pyspark.sql import SparkSession

SPARK_JARS = ['/home/jovyan/jedis-3.6.0.jar', '/home/jovyan/spark-redis_2.12-2.6.0.jar']
spark = (SparkSession.builder.master(master_uri).appName('redis.test')
         .config('spark.executor.memory', '28g')
         .config('spark.cores.max', 16)
         .config('spark.redis.host', REDIS_HOST)
         .config('spark.redis.port', 6379)
         .config('spark.redis.db', 0)
         .config('spark.sql.debug.maxToStringFields', 65535)
         .config('spark.jars', ','.join(SPARK_JARS))
         .enableHiveSupport()
         .getOrCreate())
df = spark.sql('select * from input_table')
(df.write.format("org.apache.spark.sql.redis")
   .option("table", "output_table")
   .option("key.column", "id")
   .option("dbNum", 0)
   .save())
I am trying to store information from an Iceberg table in a Redis hash. The version of spark-redis I am using is spark-redis_2.12-2.6.0.jar. I am running my script on Spark 3.1.1, and the Redis cluster I am trying to access runs version 6.0.4. When I run the script, it starts loading data into the hash and runs fine for a couple of minutes. Then a SocketTimeoutException is raised, although data continues to be loaded into the hash. However, after about 10 minutes (this varies from session to session), there is an additional failure, and from that point on I cannot connect to the Redis data store anymore (connection refused). This connection-refused state is temporary, but can last a few hours.
Here is the log (I had to skip parts of it because it was too long to fit in a message):
ANTLR Tool version 4.8 used for code generation does not match the current runtime version 4.7.1
ANTLR Runtime version 4.8 used for parser compilation does not match the current runtime version 4.7.1
ANTLR Tool version 4.8 used for code generation does not match the current runtime version 4.7.1
ANTLR Runtime version 4.8 used for parser compilation does not match the current runtime version 4.7.1
21/06/07 17:26:08 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
21/06/07 17:30:07 WARN TaskSetManager: Lost task 13.0 in stage 2.0 (TID 16) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: java.net.SocketTimeoutException: Read timed out
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:205)
at redis.clients.jedis.util.RedisInputStream.readByte(RedisInputStream.java:43)
at redis.clients.jedis.Protocol.process(Protocol.java:158)
at redis.clients.jedis.Protocol.read(Protocol.java:223)
at redis.clients.jedis.Connection.readProtocolWithCheckingBroken(Connection.java:352)
at redis.clients.jedis.Connection.getMany(Connection.java:364)
at redis.clients.jedis.Pipeline.sync(Pipeline.java:98)
at com.redislabs.provider.redis.util.PipelineUtils$.$anonfun$foreachWithPipeline$1(PipelineUtils.scala:71)
at com.redislabs.provider.redis.util.PipelineUtils$.$anonfun$foreachWithPipeline$1$adapted(PipelineUtils.scala:67)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
at com.redislabs.provider.redis.util.PipelineUtils$.foreachWithPipeline(PipelineUtils.scala:67)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8(RedisSourceRelation.scala:143)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8$adapted(RedisSourceRelation.scala:141)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6(RedisSourceRelation.scala:141)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6$adapted(RedisSourceRelation.scala:138)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5(RedisSourceRelation.scala:138)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5$adapted(RedisSourceRelation.scala:136)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1020)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1020)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.net.SocketInputStream.read(SocketInputStream.java:127)
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:199)
… 35 more
21/06/07 17:41:57 WARN TaskSetManager: Lost task 19.0 in stage 2.0 (TID 22) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: java.net.SocketTimeoutException: Read timed out
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:205)
at redis.clients.jedis.util.RedisInputStream.readByte(RedisInputStream.java:43)
…
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.net.SocketInputStream.read(SocketInputStream.java:127)
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:199)
… 35 more
21/06/07 17:52:54 WARN TaskSetManager: Lost task 4.2 in stage 2.0 (TID 42) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: Unexpected end of stream.
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:202)
at redis.clients.jedis.util.RedisInputStream.readByte(RedisInputStream.java:43)
…
21/06/07 17:52:54 WARN TaskSetManager: Lost task 2.1 in stage 2.0 (TID 46) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: java.net.SocketException: Connection reset
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:205)
…
Caused by: java.net.SocketException: Connection reset
at java.net.SocketInputStream.read(SocketInputStream.java:210)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.net.SocketInputStream.read(SocketInputStream.java:127)
at redis.clients.jedis.util.RedisInputStream.ensureFill(RedisInputStream.java:199)
… 35 more
21/06/07 17:52:56 WARN TaskSetManager: Lost task 4.3 in stage 2.0 (TID 52) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: Could not get a resource from the pool
at redis.clients.jedis.util.Pool.getResource(Pool.java:84)
…
Caused by: redis.clients.jedis.exceptions.JedisConnectionException: Failed to create socket.
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:110)
at redis.clients.jedis.Connection.connect(Connection.java:226)
…
Caused by: java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:80)
… 38 more
21/06/07 17:52:56 ERROR TaskSetManager: Task 4 in stage 2.0 failed 4 times; aborting job
Traceback (most recent call last):
File "redis_load.py", line 106, in <module>
df = spark.sql('select * from input_table')
File "/opt/conda/lib/python3.8/site-packages/pyspark/sql/readwriter.py", line 1107, in save
self._jwrite.save()
File "/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "/opt/conda/lib/python3.8/site-packages/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/opt/conda/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o90.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 2.0 failed 4 times, most recent failure: Lost task 4.3 in stage 2.0 (TID 52) (x.x.x.x executor 0): redis.clients.jedis.exceptions.JedisConnectionException: Could not get a resource from the pool
at redis.clients.jedis.util.Pool.getResource(Pool.java:84)
at redis.clients.jedis.JedisPool.getResource(JedisPool.java:366)
at com.redislabs.provider.redis.ConnectionPool$.connect(ConnectionPool.scala:35)
at com.redislabs.provider.redis.RedisEndpoint.connect(RedisConfig.scala:72)
at com.redislabs.provider.redis.RedisNode.connect(RedisConfig.scala:89)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8(RedisSourceRelation.scala:142)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8$adapted(RedisSourceRelation.scala:141)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6(RedisSourceRelation.scala:141)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6$adapted(RedisSourceRelation.scala:138)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5(RedisSourceRelation.scala:138)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5$adapted(RedisSourceRelation.scala:136)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1020)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1020)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: redis.clients.jedis.exceptions.JedisConnectionException: Failed to create socket.
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:110)
at redis.clients.jedis.Connection.connect(Connection.java:226)
at redis.clients.jedis.BinaryClient.connect(BinaryClient.java:135)
at redis.clients.jedis.BinaryJedis.connect(BinaryJedis.java:309)
at redis.clients.jedis.BinaryJedis.initializeFromClientConfig(BinaryJedis.java:87)
at redis.clients.jedis.BinaryJedis.<init>(BinaryJedis.java:292)
at redis.clients.jedis.Jedis.<init>(Jedis.java:167)
at redis.clients.jedis.JedisFactory.makeObject(JedisFactory.java:177)
at org.apache.commons.pool2.impl.GenericObjectPool.create(GenericObjectPool.java:889)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:424)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:349)
at redis.clients.jedis.util.Pool.getResource(Pool.java:75)
… 27 more
Caused by: java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:80)
… 38 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2267)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1020)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1018)
at org.apache.spark.sql.Dataset.$anonfun$foreachPartition$1(Dataset.scala:2906)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.Dataset.$anonfun$withNewRDDExecutionId$1(Dataset.scala:3676)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withNewRDDExecutionId(Dataset.scala:3674)
at org.apache.spark.sql.Dataset.foreachPartition(Dataset.scala:2906)
at org.apache.spark.sql.redis.RedisSourceRelation.insert(RedisSourceRelation.scala:136)
at org.apache.spark.sql.redis.DefaultSource.createRelation(DefaultSource.scala:30)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:301)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: redis.clients.jedis.exceptions.JedisConnectionException: Could not get a resource from the pool
at redis.clients.jedis.util.Pool.getResource(Pool.java:84)
at redis.clients.jedis.JedisPool.getResource(JedisPool.java:366)
at com.redislabs.provider.redis.ConnectionPool$.connect(ConnectionPool.scala:35)
at com.redislabs.provider.redis.RedisEndpoint.connect(RedisConfig.scala:72)
at com.redislabs.provider.redis.RedisNode.connect(RedisConfig.scala:89)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8(RedisSourceRelation.scala:142)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$8$adapted(RedisSourceRelation.scala:141)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6(RedisSourceRelation.scala:141)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$6$adapted(RedisSourceRelation.scala:138)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5(RedisSourceRelation.scala:138)
at org.apache.spark.sql.redis.RedisSourceRelation.$anonfun$insert$5$adapted(RedisSourceRelation.scala:136)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1020)
at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1020)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2242)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
… 1 more
Caused by: redis.clients.jedis.exceptions.JedisConnectionException: Failed to create socket.
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:110)
at redis.clients.jedis.Connection.connect(Connection.java:226)
at redis.clients.jedis.BinaryClient.connect(BinaryClient.java:135)
at redis.clients.jedis.BinaryJedis.connect(BinaryJedis.java:309)
at redis.clients.jedis.BinaryJedis.initializeFromClientConfig(BinaryJedis.java:87)
at redis.clients.jedis.BinaryJedis.<init>(BinaryJedis.java:292)
at redis.clients.jedis.Jedis.<init>(Jedis.java:167)
at redis.clients.jedis.JedisFactory.makeObject(JedisFactory.java:177)
at org.apache.commons.pool2.impl.GenericObjectPool.create(GenericObjectPool.java:889)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:424)
at org.apache.commons.pool2.impl.GenericObjectPool.borrowObject(GenericObjectPool.java:349)
at redis.clients.jedis.util.Pool.getResource(Pool.java:75)
… 27 more
Caused by: java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method)
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at redis.clients.jedis.DefaultJedisSocketFactory.createSocket(DefaultJedisSocketFactory.java:80)
… 38 more
21/06/07 17:52:57 WARN TaskSetManager: Lost task 15.2 in stage 2.0 (TID 67) (x.x.x.x executor 0): TaskKilled (Stage cancelled)
It looks like some connection pool gets exhausted, but I have no idea how you can tell spark-redis how connections should be allocated and when to recycle them. I also do not know whether the problem arises from the network configuration or from the Redis server configuration. Any pointers on how to troubleshoot these problems would be appreciated.
I am also getting similar problems when I run a Python script or Java app that uses the Redis API to read from and write to Redis using pipelines.
I am seeing many of these logs:
Asynchronous AOF fsync is taking too long (disk is busy?)
It looks like, because I have enabled persistence on my Redis server, the server periodically writes the hash to disk while the write is in progress, which delays its responses from time to time. The socket (read) timeout is 2 seconds by default, I believe, and it seems the server may not always respond within this limit. I have increased it to a much higher value by doing .config("spark.redis.timeout", DEFAULT_TIMEOUT) on the Spark context (I could also have set the timeout per dataframe by adding the option .option("timeout", DEFAULT_TIMEOUT) to the write operation).
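Putting the two timeout knobs from the last paragraph together, a sketch (the table and host names are placeholders from the question, and DEFAULT_TIMEOUT = 60000 ms is an arbitrary example, not a recommended value):

from pyspark.sql import SparkSession

DEFAULT_TIMEOUT = 60000  # hypothetical: 60 s instead of the ~2 s default

# Session-wide, on the builder:
spark = (SparkSession.builder
         .config('spark.redis.host', 'redis-host')  # placeholder host
         .config('spark.redis.timeout', DEFAULT_TIMEOUT)
         .enableHiveSupport()
         .getOrCreate())

# Or per write, on the DataFrameWriter:
df = spark.sql('select * from input_table')
(df.write.format("org.apache.spark.sql.redis")
   .option("table", "output_table")
   .option("key.column", "id")
   .option("timeout", DEFAULT_TIMEOUT)
   .save())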

py4j.protocol.Py4JJavaError with PySpark [duplicate]

I'm trying to complete this Spark tutorial.
After installing Spark on my local machine (Win10 64-bit, Python 3, Spark 2.4.0) and setting all env variables (HADOOP_HOME, SPARK_HOME, etc.), I'm trying to run a simple WordCount.py Spark application:
from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
    conf = SparkConf().setAppName("word count").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("C:/Users/mjdbr/Documents/BigData/python-spark-tutorial/in/word_count.text")
    words = lines.flatMap(lambda line: line.split(" "))
    wordCounts = words.countByValue()
    for word, count in wordCounts.items():
        print("{} : {}".format(word, count))
After running it from the command line:
spark-submit WordCount.py
I get the error below.
I checked (by commenting it out line by line) that it crashes at
wordCounts = words.countByValue()
Any idea what I should check to make it work?
Traceback (most recent call last):
File "C:\Users\mjdbr\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Users\mjdbr\Anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 25, in <module>
ModuleNotFoundError: No module named 'resource'
18/11/10 23:16:58 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
... 14 more
18/11/10 23:16:58 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Traceback (most recent call last):
File "C:/Users/mjdbr/Documents/BigData/python-spark-tutorial/rdd/WordCount.py", line 19, in <module>
wordCounts = words.countByValue()
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 1261, in countByValue
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 844, in reduce
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 816, in collect
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "C:\Spark\spark-2.4.0-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure:
Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
... 14 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:170)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:108)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
at java.net.PlainSocketImpl.accept(Unknown Source)
at java.net.ServerSocket.implAccept(Unknown Source)
at java.net.ServerSocket.accept(Unknown Source)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:164)
... 14 more
As suggested by theplatypus, I checked whether the 'resource' module can be imported directly from a terminal; apparently not:
>>> import resource
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
ModuleNotFoundError: No module named 'resource'
In terms of installation, I followed the instructions from this tutorial:
downloaded spark-2.4.0-bin-hadoop2.7.tgz from the Apache Spark website
unzipped it to my C: drive
already had Python 3 installed (Anaconda distribution) as well as Java
created a local 'C:\hadoop\bin' folder to store winutils.exe
created a 'C:\tmp\hive' folder and gave Spark access to it
added the environment variables (SPARK_HOME, HADOOP_HOME, etc.)
Is there any extra resource I should install?
I got the same error. I solved it by installing the previous version of Spark (2.3 instead of 2.4). Now it works perfectly; maybe it is an issue with the latest version of PySpark.
Set the environment variable PYSPARK_PYTHON=python to fix it.
The heart of the problem is the connection between PySpark and Python, which is solved by redefining that environment variable.
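For what it's worth, a minimal sketch of setting the variable from inside the script itself, before the SparkSession (and therefore the Py4J gateway and worker factory) is created; in local mode the workers inherit the driver's environment, and "python" here is simply whatever your shell resolves on PATH:
import os
# Point PySpark's driver and workers at the same interpreter; adjust
# the name if you have several Python installs on this machine.
os.environ["PYSPARK_PYTHON"] = "python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "python"

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("env-check").getOrCreate()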
I just changed the environment variables' values: PYSPARK_DRIVER_PYTHON from ipython to jupyter and PYSPARK_PYTHON from python3 to python.
Now I'm using Jupyter Notebook, Python 3.7, Java JDK 11.0.6, and Spark 2.4.2.
I had the same issue. I had set all the environment variables correctly but still wasn't able to resolve it.
In my case, adding
import findspark
findspark.init()
before even creating the SparkSession helped; see the sketch below.
I was using Visual Studio Code on Windows 10; the Spark version was 3.2.0 and the Python version 3.9.
Note: first check that the paths for HADOOP_HOME, SPARK_HOME, and PYSPARK_PYTHON have been set correctly.
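For context, a minimal sketch of how that looks in a full script; the SPARK_HOME path below is hypothetical, and findspark.init() with no argument works if SPARK_HOME is already set in the environment:
import findspark
# Locate the Spark installation and put its pyspark on sys.path
findspark.init("C:\\Spark\\spark-3.2.0-bin-hadoop3.2")  # illustrative path

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("findspark-check").getOrCreate()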
Downgrading Spark back to 2.3.2 from 2.4.0 was not enough for me. I don't know why, but in my case I had to create the SparkContext from the SparkSession, like
sc = spark.sparkContext
Then the very same error disappeared.
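In case it helps anyone, a sketch of what that looks like end to end; the input path is illustrative, and the context is derived from the session rather than constructed with SparkContext(...) directly:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("wordcount").getOrCreate()
sc = spark.sparkContext  # reuse the session's context instead of creating one

# Same shape as the failing WordCount job above
words = sc.textFile("in/word_count_data.txt").flatMap(lambda line: line.split(" "))
wordCounts = words.countByValue()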
Looking at the source of the error (worker.py#L25), it seems that the Python interpreter used to instantiate a PySpark worker doesn't have access to the resource module, a built-in module referred to in Python's docs as part of "Unix Specific Services".
Are you sure you can run PySpark on Windows (without some additional software like GOW or MinGW at least), and that you didn't skip some Windows-specific installation steps?
Could you open a Python console (the one used by PySpark) and see if you can >>> import resource without getting the same ModuleNotFoundError? If you can't, could you share the resources you used to install it on Windows 10?
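A minimal sketch of that diagnostic, runnable in the same interpreter PySpark uses; sys.executable tells you which interpreter you are actually in, and resource is expected to be missing on a stock Windows Python since it is Unix-only:
import sys

print(sys.executable, sys.version)
try:
    import resource  # part of "Unix Specific Services" in the stdlib
    print("resource module available")
except ModuleNotFoundError:
    print("resource module missing (expected on Windows)")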
When you run the Python installer, in the Customize Python section, make sure that the option "Add python.exe to Path" is selected. If this option is not selected, some of the PySpark utilities such as pyspark and spark-submit might not work. This worked for me! Happy sharing :)
There seem to be many reasons for this error to occur. I still had the same problem despite my environment variables all being set correctly.
In my case, adding
import findspark
findspark.init()
solved the problem.
I'm using a Jupyter notebook on Windows 10 with Spark 3.1.2 and Python 3.6.
I had to set:
HADOOP_HOME = [..]\winutils\hadoop-3.0.0
PATH = [..]\winutils\hadoop-3.0.0\bin
PYSPARK_PYTHON = python
You can find winutils here. I use Hadoop 3.0.0 and PySpark 3.2.1.
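A quick, hedged sanity check you can run before starting Spark to confirm those variables point where you think they do; winutils.exe living under %HADOOP_HOME%\bin is the layout Spark expects on Windows:
import os
from pathlib import Path

# Print the variables PySpark cares about on Windows
for var in ("SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON"):
    print(f"{var} = {os.environ.get(var)}")

# winutils.exe must sit in HADOOP_HOME\bin
hadoop_home = os.environ.get("HADOOP_HOME", "")
print("winutils.exe found:", Path(hadoop_home, "bin", "winutils.exe").exists())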

Spark job failing on Dataproc (it works on Databricks), error messages not clear to me

Update: I needed to increase the memory on the Dataproc nodes, but for various reasons I couldn't get to the Spark UI to see why the executors were dying. Coming back to this project with a little more Spark and GCP experience let me solve the issue quickly.
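For anyone landing here with the same symptom (executors exiting with status 1 and no useful diagnostics), a hedged sketch of the kind of change that resolved it; the value is illustrative and needs tuning to your node size, and the same property can also be passed at submit time via gcloud dataproc jobs submit pyspark --properties:
from pyspark.sql import SparkSession

# Illustrative size for n1-standard-4 workers (15 GB per node); leave
# headroom for YARN and the OS. Executor settings take effect here
# because executors launch after the session is created; driver memory
# has to be set at submit time instead.
spark = (
    SparkSession.builder
    .appName("test-mf")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)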
====
I've been trying for a long time to get the predict phase of the ALS recommender model in pyspark to run on Dataproc. Update: Confirmed that this code does run successfully on Databricks.
Code:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALSModel

spark = SparkSession.builder.appName("test-mf").getOrCreate()
model = ALSModel.load("gs://my-dataproc-bucket/trained-model")
userRecs = model.recommendForAllUsers(100).collect()
(I'm just doing the collect since it seems like the simplest operation to get the code to actually work; I was originally doing some select statements to try to process the data, and those were failing as well.)
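In hindsight, collect() ships every row to the driver, which for recommendForAllUsers(100) can be sizeable; a couple of lighter-weight actions, with an illustrative output path, look like this:
userRecs = model.recommendForAllUsers(100)
# Keep the results distributed instead of pulling them to the driver
userRecs.write.mode("overwrite").parquet("gs://my-dataproc-bucket/user-recs")
# Or just eyeball a few rows cheaply
userRecs.show(5, truncate=False)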
I get a ton of error messages, in a relatively quick time span (maybe 15 minutes from starting the job to final failure), none of which mean much to me or have yielded an easy smoking gun from googling.
Here's the last set of logs, let me know if you need anything earlier:
18/03/27 22:38:59 WARN org.apache.spark.ExecutorAllocationManager: No stages are running, but numRunningTasks != 0
Traceback (most recent call last):
File "/tmp/be3c5758e6694a4ca7f2911043f7a173/spark-matrix-factorization.py", line 35, in <module>
userRecs = model.recommendForAllUsers(100).collect()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 438, in collect
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o50.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 4.0 failed 4 times, most recent failure: Lost task 1.3 in stage 4.0 (TID 26, my-dataproc-cluster-w-1.c.my-gcp-project.internal, executor 10): ExecutorLostFailure (executor 10 exited caused by one of the running tasks) Reason: Container marked as failed: container_1520973147661_0018_01_000012 on host: my-dataproc-cluster-w-1.c.my-gcp-project.internal. Exit status: 1. Diagnostics: Exception from container-launch.
Container id: container_1520973147661_0018_01_000012
Exit code: 1
Stack trace: ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:972)
at org.apache.hadoop.util.Shell.run(Shell.java:869)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1170)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:236)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:305)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:84)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Container exited with a non-zero exit code 1
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2803)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2800)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2800)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2823)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2800)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
18/03/27 22:38:59 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark#446a8845{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
18/03/27 22:38:59 WARN org.apache.spark.ExecutorAllocationManager: Attempted to mark unknown executor 10 idle
ERROR: (gcloud.dataproc.jobs.submit.pyspark) Job [be3c5758e6694a4ca7f2911043f7a173] entered state [ERROR] while waiting for [DONE].
I've been trying to see whether there are more logs anywhere that could give me a more informative error message, but I had no luck getting the proxy set up to see the UI in Dataproc, and I didn't find any messages after running gcloud dataproc clusters diagnose.
In response to Dennis below,
Machine types:
Master node: Standard (1 master, N workers), n1-standard-4 (4 vCPU, 15.0 GB memory), 500 GB primary disk
Worker nodes: 2 × n1-standard-4 (4 vCPU, 15.0 GB memory), 500 GB primary disk, 0 local SSDs
Data size:
The entire trained ALS model (which contains all the data already) is only 104M.
Using count instead of collect gives a similar problem:
18/03/28 22:37:08 ERROR org.apache.spark.scheduler.TaskSetManager: Task 3 in stage 4.0 failed 4 times; aborting job
18/03/28 22:37:08 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.3 in stage 4.0 (TID 18, my-dataproc-cluster-w-1.c.my-dataproc-cluster.internal, executor 6): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Container marked as failed: container_1520973147661_0019_01_000008 on host: my-dataproc-cluster-w-1.c.my-dataproc-cluster.internal. Exit status: 1. Diagnostics: Exception from container-launch.
Container id: container_1520973147661_0019_01_000008
Exit code: 1
Stack trace: ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:972)
at org.apache.hadoop.util.Shell.run(Shell.java:869)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1170)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:236)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:305)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:84)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Container exited with a non-zero exit code 1
18/03/28 22:37:08 WARN org.apache.spark.ExecutorAllocationManager: No stages are running, but numRunningTasks != 0
18/03/28 22:37:08 WARN org.apache.spark.ExecutorAllocationManager: Attempted to mark unknown executor 6 idle
Traceback (most recent call last):
File "/tmp/9d05f24785474f1f84720daa115af584/spark-matrix-factorization.py", line 35, in <module>
userRecs = model.recommendForAllUsers(100).count()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 427, in count
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o50.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 4.0 failed 4 times, most recent failure: Lost task 3.3 in stage 4.0 (TID 19, my-dataproc-cluster-w-1.c.my-dataproc-cluster.internal, executor 6): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Container marked as failed: container_1520973147661_0019_01_000008 on host: my-dataproc-cluster-w-1.c.my-dataproc-cluster.internal. Exit status: 1. Diagnostics: Exception from container-launch.
Container id: container_1520973147661_0019_01_000008
Exit code: 1
Stack trace: ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:972)
at org.apache.hadoop.util.Shell.run(Shell.java:869)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1170)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:236)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:305)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:84)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Container exited with a non-zero exit code 1
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2430)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2429)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2429)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
18/03/28 22:37:08 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark#ee58b0b{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
ERROR: (gcloud.dataproc.jobs.submit.pyspark) Job [9d05f24785474f1f84720daa115af584] entered state [ERROR] while waiting for [DONE].

Resources