I recently installed Spark 2.3 on my Windows machine (with Java 8) and was able to run it via Jupyter Notebooks (Python 3).
Suddenly it stopped working; I get the error below when trying to instantiate a SparkContext within the notebook:
from pyspark import SparkContext
sc = pyspark.SparkContext()
Splitting the code into one line per cell shows that it's the second line that causes it.
It seems to be purely a notebook issue, as I'm still able to execute .py files with spark-submit from the command line.
Any idea how to fix it?
-------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-78-57590c71cf44> in <module>()
1 from pyspark import SparkContext
----> 2 sc = pyspark.SparkContext()
~\Anaconda3\lib\site-packages\pyspark\context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
116 try:
117 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 118 conf, jsc, profiler_cls)
119 except:
120 # If an error occurs, clean up in order to allow future SparkContext creation:
~\Anaconda3\lib\site-packages\pyspark\context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
186 self._accumulatorServer = accumulators._start_update_server()
187 (host, port) = self._accumulatorServer.server_address
--> 188 self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
189 self._jsc.sc().register(self._javaAccumulator)
190
~\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1523 answer = self._gateway_client.send_command(command)
1524 return_value = get_return_value(
-> 1525 answer, self._gateway_client, None, self._fqn)
1526
1527 for temp_arg in temp_args:
~\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 332 format(target_id, ".", name, value))
333 else:
334 raise Py4JError(
Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonAccumulatorV2. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonAccumulatorV2([class java.lang.String, class java.lang.Integer]) does not exist
at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
at py4j.Gateway.invoke(Gateway.java:237)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
I also had the same issue. It was basically due to a mismatch between the pyspark and Spark versions, so check that the patch version ("sub-version") matches as well, e.g. v2.3.1 vs v2.3.2.
You can use:
pip install pyspark==2.3.x to get the right version
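If it helps, here is a quick way to compare the two versions from the notebook (a sketch; it assumes SPARK_HOME points at your Spark install):
import pyspark
print(pyspark.__version__)   # version of the pip-installed pyspark package
Compare that with the output of %SPARK_HOME%\bin\spark-submit --version in a terminal; the two should match down to the patch level (e.g. 2.3.1 vs 2.3.2).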
I had the same issue; I solved it by updating my pyspark to the latest version.
Related
I'm receiving a "Java gateway process exited before sending its port number" error when I set the PySpark master to yarn_client while defining the Spark session configs. It works fine when the master is set to local.
I'm running this in a Jupyter notebook, and the error is returned there.
My code is as follows:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Spark_hadoop").setMaster("yarn_client").set("spark.executor.memory","5g")
sc = SparkContext(conf=conf)
and the full error is this:
RuntimeError Traceback (most recent call last)
<ipython-input-3-2a71daf20656> in <module>
1 conf = SparkConf().setAppName("Spark_hadoop").setMaster("yarn_client").set("spark.executor.memory","5g")
----> 2 sc = SparkContext(conf=conf)
3 sc=SparkContext.getOrCreate(conf=create_spark_conf().setMaster("local[4]").set("spark.driver.memory","8g").set("spark.executor.memory", '8g').set('spark.executor.cores', 4))
4 sc.setLogLevel("ERROR")
5 sqlContext = SQLContext(sc)
~/.local/lib/python3.6/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
142 " is not allowed as it is a security risk.")
143
--> 144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
~/.local/lib/python3.6/site-packages/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
337 with SparkContext._lock:
338 if not SparkContext._gateway:
--> 339 SparkContext._gateway = gateway or launch_gateway(conf)
340 SparkContext._jvm = SparkContext._gateway.jvm
341
~/.local/lib/python3.6/site-packages/pyspark/java_gateway.py in launch_gateway(conf, popen_kwargs)
106
107 if not os.path.isfile(conn_info_file):
--> 108 raise RuntimeError("Java gateway process exited before sending its port number")
109
110 with open(conn_info_file, "rb") as info:
RuntimeError: Java gateway process exited before sending its port number
I'm using Ubuntu 18. Hadoop and YARN are all set up, and I have tested both Java 8 and 11.
JAVA_HOME, YARN_CONF_DIR and HADOOP_CONF_DIR are all set in ~/.bashrc.
After some modifications the error changed to this:
Py4JJavaError Traceback (most recent call last)
<ipython-input-7-2ee19c87679b> in <module>
2 findspark.init()
3 conf = SparkConf().setAppName("Spark_hadoop").setMaster("yarn").set("spark.executor.memory","5g")
----> 4 sc = SparkContext(conf=conf)
5 sc.setLogLevel("ERROR")
6 sqlContext = SQLContext(sc)
~/.local/lib/python3.6/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
145 try:
146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 147 conf, jsc, profiler_cls)
148 except:
149 # If an error occurs, clean up in order to allow future SparkContext creation:
~/.local/lib/python3.6/site-packages/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
207
208 # Create the Java SparkContext through Py4J
--> 209 self._jsc = jsc or self._initialize_context(self._conf._jconf)
210 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
211 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
~/.local/lib/python3.6/site-packages/pyspark/context.py in _initialize_context(self, jconf)
327 Initialize SparkContext in function to allow subclass specific initialization
328 """
--> 329 return self._jvm.JavaSparkContext(jconf)
330
331 #classmethod
~/.local/lib/python3.6/site-packages/py4j/java_gateway.py in __call__(self, *args)
1584 answer = self._gateway_client.send_command(command)
1585 return_value = get_return_value(
-> 1586 answer, self._gateway_client, None, self._fqn)
1587
1588 for temp_arg in temp_args:
~/.local/lib/python3.6/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
~/.local/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: org.apache.spark.SparkException: Application application_1675283388270_0012 failed 2 times due to ApplicationMaster for attempt appattempt_1675283388270_0012_000002 timed out. Failing the application.
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.waitForApplication(YarnClientSchedulerBackend.scala:98)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend.start(YarnClientSchedulerBackend.scala:65)
at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:222)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:585)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
I have one master node and 2 slave nodes, each on a separate VM (3 VMs in total).
On the master node the yarn-site.xml file is left at its defaults and I haven't modified it, but on the slaves I have added the lines below:
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop-master</value>
</property>
YARN and Spark both run on the master VM, and the YARN dashboard is accessible at http://hadoop-master:8088/cluster.
(Screenshots from the original post: the YARN dashboard and the description of one of the applications.)
yarn_client is not a valid master value.
It is just "yarn", and you need a valid yarn-site.xml in the directory pointed to by the HADOOP_CONF_DIR environment variable.
To deploy as a "client", you'd use something like spark-submit --deploy-mode=client app.py.
Also, for local testing, I'd suggest not using a whole 5g (or 8g? Why are you setting the config twice and overriding the master?) of memory.
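For reference, a minimal sketch of what the corrected setup might look like, assuming HADOOP_CONF_DIR and YARN_CONF_DIR already point at a valid Hadoop/YARN configuration (the memory value is only an example):
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("Spark_hadoop")
        .setMaster("yarn")                         # "yarn" is the only valid YARN master value
        .set("spark.submit.deployMode", "client")  # client mode keeps the driver in the notebook
        .set("spark.executor.memory", "2g"))       # modest executor memory while testing
sc = SparkContext(conf=conf)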
I'm trying to learn to use PySpark with Jupyter notebooks. I created an env for PySpark and installed it in Anaconda; the Python version is 3.10.8, and the Java version in the env is:
openjdk 17.0.3 2022-04-19 LTS
OpenJDK Runtime Environment Zulu17.34+19-CA (build 17.0.3+7-LTS)
OpenJDK 64-Bit Server VM Zulu17.34+19-CA (build 17.0.3+7-LTS, mixed mode, sharing)
When opening JupyterLab and trying to run my first Spark session, I used:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("JupNote").getOrCreate()
and get the error:
Py4JJavaError Traceback (most recent call last)
c:\Users\frezanlutu\Skills_Training\BigData\pyspark.ipynb Cell 3 in <cell line: 1>()
----> 1 spark = SparkSession.builder.appName("JupNote").getOrCreate()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\sql\session.py:228, in SparkSession.Builder.getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
231 session = SparkSession(sc)
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:392, in SparkContext.getOrCreate(cls, conf)
390 with SparkContext._lock:
391 if SparkContext._active_spark_context is None:
--> 392 SparkContext(conf=conf or SparkConf())
393 return SparkContext._active_spark_context
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:146, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
--> 146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
147 conf, jsc, profiler_cls)
148 except:
149 # If an error occurs, clean up in order to allow future SparkContext creation:
150 self.stop()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:209, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
206 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
208 # Create the Java SparkContext through Py4J
--> 209 self._jsc = jsc or self._initialize_context(self._conf._jconf)
210 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
211 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:329, in SparkContext._initialize_context(self, jconf)
325 def _initialize_context(self, jconf):
326 """
327 Initialize SparkContext in function to allow subclass specific initialization
328 """
--> 329 return self._jvm.JavaSparkContext(jconf)
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py:1585, in JavaClass.__call__(self, *args)
1579 command = proto.CONSTRUCTOR_COMMAND_NAME +\
1580 self._command_header +\
1581 args_command +\
1582 proto.END_COMMAND_PART
1584 answer = self._gateway_client.send_command(command)
-> 1585 return_value = get_return_value(
1586 answer, self._gateway_client, None, self._fqn)
1588 for temp_arg in temp_args:
1589 temp_arg._detach()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.storage.StorageUtils$
at org.apache.spark.storage.BlockManagerMasterEndpoint.<init>(BlockManagerMasterEndpoint.scala:110)
at org.apache.spark.SparkEnv$.$anonfun$create$9(SparkEnv.scala:348)
at org.apache.spark.SparkEnv$.registerOrLookupEndpoint$1(SparkEnv.scala:287)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:336)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:191)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:277)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:460)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:833)
I've also tried:
spark = SparkSession.builder.config("spark.driver.host", "localhost").appName("JupNote").getOrCreate()
after looking for some solutions, but that produces the same error. Does anyone know if I'm missing anything or doing anything wrong?
Which Spark version are you using? If you are using Spark < 3.3.0, it does not support Java 17 (Java 17 support was added in 3.3.0). You may check https://spark.apache.org/docs/3.3.0/#downloading
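A quick way to check from the notebook, plus one possible workaround (a sketch; the JAVA_HOME path below is hypothetical):
import pyspark
print(pyspark.__version__)   # anything below 3.3.0 will not run on Java 17

# Either upgrade pyspark, or point the session at a Java 8/11 install before creating it:
import os
os.environ["JAVA_HOME"] = r"C:\Program Files\Zulu\zulu-11"   # example path, adjust to your machine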
I built the jupyter/all-spark-notebook Docker image. I installed geomesa_pyspark and tried to run the following example commands from the official guide.
import geomesa_pyspark
import pyspark
from pyspark.sql import SparkSession
conf = geomesa_pyspark.configure(
    jars=['/usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar'],
    packages=['geomesa_pyspark', 'pytz'],
    spark_home='/usr/local/spark/').\
    setAppName('MyTestApp')

# sc = pyspark.SparkContext()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .enableHiveSupport()
         .getOrCreate())
The code as-is gives the error below, while uncommenting the SparkContext creation statement makes it execute correctly.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-1-22f9613a0be5> in <module>
31 .builder
32 .master('spark://spark-master:7077')
---> 33 .config(conf=conf)
34 .enableHiveSupport()
35 .getOrCreate()
/usr/local/spark/python/pyspark/sql/session.py in getOrCreate(self)
171 for key, value in self._options.items():
172 sparkConf.set(key, value)
--> 173 sc = SparkContext.getOrCreate(sparkConf)
174 # This SparkContext may be an existing one.
175 for key, value in self._options.items():
/usr/local/spark/python/pyspark/context.py in getOrCreate(cls, conf)
365 with SparkContext._lock:
366 if SparkContext._active_spark_context is None:
--> 367 SparkContext(conf=conf or SparkConf())
368 return SparkContext._active_spark_context
369
/usr/local/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
134 try:
135 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 136 conf, jsc, profiler_cls)
137 except:
138 # If an error occurs, clean up in order to allow future SparkContext creation:
/usr/local/spark/python/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
196
197 # Create the Java SparkContext through Py4J
--> 198 self._jsc = jsc or self._initialize_context(self._conf._jconf)
199 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
200 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
/usr/local/spark/python/pyspark/context.py in _initialize_context(self, jconf)
304 Initialize SparkContext in function to allow subclass specific initialization
305 """
--> 306 return self._jvm.JavaSparkContext(jconf)
307
308 #classmethod
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1523 answer = self._gateway_client.send_command(command)
1524 return_value = get_return_value(
-> 1525 answer, self._gateway_client, None, self._fqn)
1526
1527 for temp_arg in temp_args:
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.AbstractMethodError: io.netty.util.concurrent.MultithreadEventExecutorGroup.newChild(Ljava/util/concurrent/ThreadFactory;[Ljava/lang/Object;)Lio/netty/util/concurrent/EventExecutor;
at io.netty.util.concurrent.MultithreadEventExecutorGroup.<init>(MultithreadEventExecutorGroup.java:64)
at io.netty.channel.MultithreadEventLoopGroup.<init>(MultithreadEventLoopGroup.java:59)
at io.netty.channel.nio.NioEventLoopGroup.<init>(NioEventLoopGroup.java:78)
at io.netty.channel.nio.NioEventLoopGroup.<init>(NioEventLoopGroup.java:73)
at io.netty.channel.nio.NioEventLoopGroup.<init>(NioEventLoopGroup.java:60)
at org.apache.spark.network.util.NettyUtils.createEventLoop(NettyUtils.java:50)
at org.apache.spark.network.client.TransportClientFactory.<init>(TransportClientFactory.java:102)
at org.apache.spark.network.TransportContext.createClientFactory(TransportContext.java:99)
at org.apache.spark.rpc.netty.NettyRpcEnv.<init>(NettyRpcEnv.scala:71)
at org.apache.spark.rpc.netty.NettyRpcEnvFactory.create(NettyRpcEnv.scala:461)
at org.apache.spark.rpc.RpcEnv$.create(RpcEnv.scala:57)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:249)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:175)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:257)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:424)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
I use the following versions:
Spark 2.4.5
Hadoop 2.7
java-1.8.0-openjdk-amd64.
Why does it need a SparkContext? Shouldn't it be included in the SparkSession?
It may just be a classpath issue. An AbstractMethodError indicates a classpath problem - see for example this post. Since the error is occurring in netty, you should check your classpath for different versions of the netty jars.
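A small sketch for spotting duplicate netty artifacts from inside the container (the jar locations are assumptions based on the question):
import glob

# netty jars shipped with Spark itself:
print(glob.glob('/usr/local/spark/jars/*netty*'))
# The geomesa runtime jar may bundle its own netty classes; inspect it as well, e.g. with
# jar tf /usr/local/spark/jars/geomesa-accumulo-spark-runtime_2.11-2.0.0.jar | grep netty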
I'm trying to connect to a remote Spark master from a notebook on my local machine.
When I try creating the SparkContext with
import pyspark

sc = pyspark.SparkContext(master="spark://remote-spark-master-hostname:7077",
                          appName="jupyter notebook_test")
I get the following exception:
/opt/.venv/lib/python3.7/site-packages/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
134 try:
135 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 136 conf, jsc, profiler_cls)
137 except:
138 # If an error occurs, clean up in order to allow future SparkContext creation:
/opt/.venv/lib/python3.7/site-packages/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
196
197 # Create the Java SparkContext through Py4J
--> 198 self._jsc = jsc or self._initialize_context(self._conf._jconf)
199 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
200 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
/opt/.venv/lib/python3.7/site-packages/pyspark/context.py in _initialize_context(self, jconf)
304 Initialize SparkContext in function to allow subclass specific initialization
305 """
--> 306 return self._jvm.JavaSparkContext(jconf)
307
308 #classmethod
/opt/.venv/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1523 answer = self._gateway_client.send_command(command)
1524 return_value = get_return_value(
-> 1525 answer, self._gateway_client, None, self._fqn)
1526
1527 for temp_arg in temp_args:
/opt/.venv/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalArgumentException: requirement failed: Can only call getServletHandlers on a running MetricsSystem
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.metrics.MetricsSystem.getServletHandlers(MetricsSystem.scala:91)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:516)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:745)
At the same time, I can create a Spark context using the same interpreter in interactive mode.
What should I do to connect to the remote Spark master from my local Jupyter notebook?
I solved my problem using @HristoIliev's advice.
In my case, PYSPARK_PYTHON was not set inside the Jupyter environment. Simple solution:
import os
os.environ["PYSPARK_PYTHON"] = '/opt/.venv/bin/python'
os.environ["SPARK_HOME"] = '/opt/spark'
You can also use findspark for this, but I didn't test it.
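For completeness, a sketch of the findspark variant (untested here, as noted above; the paths reuse the ones from this answer):
import findspark
findspark.init('/opt/spark')   # falls back to SPARK_HOME if no argument is given

import pyspark
sc = pyspark.SparkContext(master="spark://remote-spark-master-hostname:7077",
                          appName="jupyter notebook_test")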
I'm using Spark 2.0 in a notebook; this is the initial setup:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName("NewApp") \
.config("spark.driver.maxResultSize", "600g") \
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
.config("spark.rpc.message.maxSize",10737418240) \
.config("spark.executor.heartbeatInterval",10000000) \
.getOrCreate()
/usr/local/spark-2.0.1/python/pyspark/sql/session.py in getOrCreate(self)
167 for key, value in self._options.items():
168 sparkConf.set(key, value)
--> 169 sc = SparkContext.getOrCreate(sparkConf)
170 # This SparkContext may be an existing one.
171 for key, value in self._options.items():
/usr/local/spark-2.0.1/python/pyspark/context.py in getOrCreate(cls, conf)
292 with SparkContext._lock:
293 if SparkContext._active_spark_context is None:
--> 294 SparkContext(conf=conf or SparkConf())
295 return SparkContext._active_spark_context
296
/usr/local/spark-2.0.1/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
113 try:
114 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
--> 115 conf, jsc, profiler_cls)
116 except:
117 # If an error occurs, clean up in order to allow future SparkContext creation:
/usr/local/spark-2.0.1/python/pyspark/context.py in _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
166
167 # Create the Java SparkContext through Py4J
--> 168 self._jsc = jsc or self._initialize_context(self._conf._jconf)
169 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
170 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
/usr/local/spark-2.0.1/python/pyspark/context.py in _initialize_context(self, jconf)
231 Initialize SparkContext in function to allow subclass specific initialization
232 """
--> 233 return self._jvm.JavaSparkContext(jconf)
234
235 #classmethod
/usr/local/spark-2.0.1/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
1399 answer = self._gateway_client.send_command(command)
1400 return_value = get_return_value(
-> 1401 answer, self._gateway_client, None, self._fqn)
1402
1403 for temp_arg in temp_args:
/usr/local/spark-2.0.1/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NumberFormatException: For input string: "10737418240"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:583)
at java.lang.Integer.parseInt(Integer.java:615)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:272)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:29)
at org.apache.spark.SparkConf$$anonfun$getInt$2.apply(SparkConf.scala:375)
at org.apache.spark.SparkConf$$anonfun$getInt$2.apply(SparkConf.scala:375)
at scala.Option.map(Option.scala:146)
at org.apache.spark.SparkConf.getInt(SparkConf.scala:375)
at org.apache.spark.util.RpcUtils$.maxMessageSizeBytes(RpcUtils.scala:61)
at org.apache.spark.MapOutputTrackerMaster.<init>(MapOutputTracker.scala:293)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:284)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:165)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:256)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:420)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
How could I solve this problem? I tried SparkContext.stop(), but it gives:
TypeError: stop() missing 1 required positional argument: 'self'
Another question: my initial setup uses getOrCreate(), which to my understanding gets an existing context if there is one and creates it otherwise, yet it still gives this problem.
Here is the source of the error:
: java.lang.NumberFormatException: For input string: "10737418240"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:583)
10737418240 is larger than Int.MaxValue (2147483647). Use a smaller value when calling:
.config("spark.rpc.message.maxSize", ...) \
Try to fix JAVA_HOME and restart cmd.
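A quick sanity check you can run in the notebook before creating the SparkContext (a sketch; the path in the comment is only an example):
import os
print(os.environ.get("JAVA_HOME"))   # should point at a valid JDK, e.g. C:\Program Files\Java\jdk1.8.0_211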