Spark/PySpark errors on mysterious missing /tmp file - apache-spark

I'm having issues with PySpark and a missing /tmp file. I've narrowed the behavior down to a short snippet:
>>> a=sc.parallelize([(16646160,1)])
>>> b=stuff
>>> # b=sc.parallelize(b.collect())
>>> a.join(b).take(10)
This fails, but if I include the commented-out line (which should produce the same RDD), it succeeds. Here is the error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-101-90fe86df7879> in <module>()
3 b=stuff.map(lambda x:(16646160,1))
4 #b=sc.parallelize(b.collect())
----> 5 a.join(b).take(10)
6 b.take(10)
/usr/lib/spark/python/pyspark/rdd.py in take(self, num)
1109
1110 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1111 res = self.context.runJob(self, takeUpToNumLeft, p, True)
1112
1113 items += res
/usr/lib/spark/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
816 # SparkContext#runJob.
817 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 818 it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
819 return list(mappedRDD._collect_iterator_through_file(it))
820
/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 210.0 failed 1 times, most recent failure: Lost task 1.0 in stage 210.0 (TID 884, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/lib/spark/python/pyspark/worker.py", line 92, in main
command = pickleSer.loads(command.value)
File "/usr/lib/spark/python/pyspark/broadcast.py", line 106, in value
self._value = self.load(self._path)
File "/usr/lib/spark/python/pyspark/broadcast.py", line 87, in load
with open(path, 'rb', 1 << 20) as f:
IOError: [Errno 2] No such file or directory: '/tmp/spark-4a8c591e-9192-4198-a608-c7daa3a5d494/tmpuzsAVM'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:137)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:174)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:96)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:87)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply$mcV$sp(PythonRDD.scala:242)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:204)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:204)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1468)
at org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:203)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1214)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1202)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1202)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:696)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:696)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1420)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
In case you're wondering what's in b:
>>> b.take(10)
[(16744491, 1),
(16203827, 1),
(16695357, 1),
(16958298, 1),
(16400458, 1),
(16810060, 1),
(11452497, 1),
(14803033, 1),
(15630426, 1),
(14917736, 1)]
So maybe, I thought, there's some weird number in there that overflows or something, and collecting and re-parallelizing "fixes" the problem. The next bit of code proves that assumption wrong.
>>> a=sc.parallelize([(16646160,1)])
>>> b=stuff.map(lambda x:(16646160,1))
>>> #b=sc.parallelize(b.collect())
>>> a.join(b).take(10)
It still breaks. (Again, including the commented-out line fixes the problem.)
So I'm apparently looking at some sort of Spark/PySpark bug. This is Spark 1.2.0. Any ideas?
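For reference, the version with the workaround uncommented looks like this and runs without error (it just spells out the commented-out line from above):
>>> a = sc.parallelize([(16646160, 1)])
>>> b = stuff.map(lambda x: (16646160, 1))
>>> b = sc.parallelize(b.collect())  # workaround: materialize b on the driver and re-parallelize
>>> a.join(b).take(10)               # now succeeds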

Related

Error while using mapPartitions in Pyspark

I am new to PySpark and I am running the Spark code below in a Jupyter notebook, and I'm getting AttributeError: 'NoneType' object has no attribute '_jvm'.
My Spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
The full error from running the code is below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'
func.sum is for use with DataFrames, not for summing plain Python numbers. Use Python's built-in sum function instead:
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator):
    yield sum(iterator)
parallel.mapPartitions(f).collect()
which will give [6, 15, 24].
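For contrast, func.sum does work when aggregating a DataFrame column. A minimal sketch, assuming an active SparkSession named spark (as in a notebook session):
from pyspark.sql import functions as func

# Hypothetical example: build a small DataFrame and aggregate it with func.sum.
df = spark.createDataFrame([(n,) for n in range(1, 10)], ["n"])
df.agg(func.sum("n")).show()  # prints a single row with sum(n) = 45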

I get an error while reading multiple multiline JSON files in Databricks

I get a Py4JJavaError message when I'm reading multiple multiline JSON files from a folder. It seems like Spark is struggling while inferring a schema from these files.
I tried to reduce the number of files to read, since it has to infer from thousands of JSON files, but that doesn't seem to help.
def get_user_details_schema(url):
    df = sqlContext.read.json(url, multiLine=True)
    return df.schema
This is the message I get:
Py4JJavaError Traceback (most recent call last)
<command-2296498238051133> in <module>()
19
20
---> 21 main()
<command-2296498238051133> in main()
15
16
---> 17 process_users(config.user_input_url, config.user_output_url)
18
19
<command-2296498238051133> in process_users(input_url, output_url)
1 def process_users(input_url, output_url):
----> 2 user_df = get_cleansed_users(input_url)
3
4 if not user_df or user_df.rdd.isEmpty():
5 print("User input dataset does not exists or is empty. Nothing to do.")
<command-2296498238051132> in get_cleansed_users(input_url)
16
17 def get_cleansed_users(input_url):
---> 18 df = read_if_exists(input_url, get_user_details_schema(input_url))
19
20 formater_date = udf(format_date)
<command-2296498238051132> in get_user_details_schema(url)
1 def get_user_details_schema(url):
----> 2 df = sqlContext.read.json(url, multiLine=True)
3
4 return df.schema
5
/databricks/spark/python/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding)
272 path = [path]
273 if type(path) == list:
--> 274 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
275 elif isinstance(path, RDD):
276 def func(iterator):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o389.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 10.10.25.4, executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2355)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2343)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2342)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2342)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1096)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2574)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2510)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:893)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2243)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2341)
at org.apache.spark.sql.catalyst.json.JsonInferSchema$.infer(JsonInferSchema.scala:83)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$$anonfun$infer$1.apply(JsonDataSource.scala:172)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$$anonfun$infer$1.apply(JsonDataSource.scala:172)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:240)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$.infer(JsonDataSource.scala:171)
at org.apache.spark.sql.execution.datasources.json.JsonDataSource.inferSchema(JsonDataSource.scala:65)
at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.inferSchema(JsonFileFormat.scala:59)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:204)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:195)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:195)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:412)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:298)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:284)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:467)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
If your cluster is running Databricks Runtime 4.0 and above, you can read JSON files in single-line or multi-line mode. In single-line mode, a file can be split into many parts and read in parallel.
Read JSON files in single-line mode:
val testJsonData = sqlContext.read.json("/tmp/test.json")
display(testJsonData)
Read JSON files in multi-line mode:
val testJsonData = sqlContext.read.option("multiline","true").json("/tmp/test.json")
display(testJsonData)
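Since the question uses PySpark, the equivalent calls in Python would be roughly as follows (a sketch; it reuses the question's sqlContext and mirrors the hypothetical /tmp/test.json path from the Scala example):
# Single-line mode (the default):
testJsonData = sqlContext.read.json("/tmp/test.json")
display(testJsonData)

# Multi-line mode, matching the question's multiLine=True usage:
testJsonData = sqlContext.read.option("multiline", "true").json("/tmp/test.json")
display(testJsonData)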
Reference: Azure Databricks - JSON Files
Hope this helps.

pyspark: sort an RDD by the object attribute

I have the following RDD, named my_rdd, which looks like this:
[FreqSequence(sequence=[['John']], freq=18980),
FreqSequence(sequence=[['Mary']], freq=106),
FreqSequence(sequence=[['John-Mary']], freq=381),
FreqSequence(sequence=[['John-Ann']], freq=158),
FreqSequence(sequence=[['Ann']], freq=433)]
I then tried to sort it as shown below:
new_rdd = my_rdd.sortBy(lambda x: x.freq)
new_rdd.take(5)
but got the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-15-94c1babd943f> in <module>()
1 print(my_rdd.take(5))
2 new_rdd = my_rdd.sortBy(lambda x: x.freq)
----> 3 new_rdd.take(5)
/usr/local/spark-latest/python/pyspark/rdd.py in take(self, num)
1341
1342 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1343 res = self.context.runJob(self, takeUpToNumLeft, p)
1344
1345 items += res
/usr/local/spark-latest/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
963 # SparkContext#runJob.
964 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 965 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
966 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
967
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark-latest/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 65.0 failed 4 times, most recent failure: Lost task 0.3 in stage 65.0 (TID 115, ph-hdp-inv-dn01, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 163, in main
func, profiler, deserializer, serializer = read_command(pickleSer, infile)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 431, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'UserString'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Any idea what was wrong here? Thanks!
Your code is correct. Your error:
ImportError: No module named 'UserString'
is raised because UserString is no longer a standalone module in Python 3.x; it is part of the collections module instead. This suggests that you are either using an outdated version of PySpark or that one of its dependencies is outdated.
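As a quick plain-Python illustration of the move (no Spark involved):
# Python 2:  import UserString          # top-level module, removed in Python 3
# Python 3:  it now lives in collections
from collections import UserString

s = UserString("spark")
print(s.upper())  # SPARK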

What does Exception: Randomness of hash of string should be disabled via PYTHONHASHSEED mean in pyspark?

I am trying to create a dictionary from a list in pyspark. I have the following list of lists:
rawPositions
Gives
[[1009794, 'LPF6 Comdty', 'BC22', 'Enterprise', 3.0, 3904.125, 390412.5],
[1009794, 'LPF6 Comdty', 'BC22', 'Enterprise', 3.0, 3900.75, 390075.0],
[1009794, 'LPF6 Comdty', 'BC22', 'Enterprise', 3.0, 3882.5625, 388256.25],
[1009794, 'LPF6 Comdty', 'BC22', 'Enterprise', 3.0, 3926.25, 392625.0],
[2766232,
'CDX IG CDSI S25 V1 5Y CBBT CORP',
'BC85',
'Enterprise',
30000000.0,
-16323.2439825,
30000000.0],
[2766232,
'CDX IG CDSI S25 V1 5Y CBBT CORP',
'BC85',
'Enterprise',
30000000.0,
-16928.620101900004,
30000000.0],
[1009804, 'LPM6 Comdty', 'BC29', 'Jet', 105.0, 129596.25, 12959625.0],
[1009804, 'LPM6 Comdty', 'BC29', 'Jet', 128.0, 162112.0, 16211200.0],
[1009804, 'LPM6 Comdty', 'BC29', 'Jet', 135.0, 167146.875, 16714687.5],
[1009804, 'LPM6 Comdty', 'BC29', 'Jet', 109.0, 132884.625, 13288462.5]]
Then, using my SparkContext variable sc, I parallelize the list:
i = sc.parallelize(rawPositions)
#i.collect()
Then I try to turn it into a dictionary by using groupBy on the element at index 3 of each list entry.
j = i.groupBy(lambda x: x[3])
j.collect()
Gives
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-143-6113a75f0a9e> in <module>()
2 #i.collect()
3 j = i.groupBy(lambda x: x[3])
----> 4 j.collect()
/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 50.0 failed 4 times, most recent failure: Lost task 14.3 in stage 50.0 (TID 7583, brllxhtce01.bluecrest.local): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 133, in dump_stream
for obj in iterator:
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1703, in add_shuffle_key
buckets[partitionFunc(k) % numPartitions].append((k, v))
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/rdd.py", line 74, in portable_hash
raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
Exception: Randomness of hash of string should be disabled via PYTHONHASHSEED
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.GeneratedMethodAccessor31.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 133, in dump_stream
for obj in iterator:
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1703, in add_shuffle_key
buckets[partitionFunc(k) % numPartitions].append((k, v))
File "/net/nas/uxhome/condor_ldrt-s/spark-1.6.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/rdd.py", line 74, in portable_hash
raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
Exception: Randomness of hash of string should be disabled via PYTHONHASHSEED
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
... 1 more
I have no idea what this error refers to... any help would be great!
Since Python 3.2.3+, the hash of str, bytes and datetime objects in Python is salted using a random value to prevent certain kinds of denial-of-service attacks. This means that hash values are consistent within a single interpreter session but differ from session to session. PYTHONHASHSEED sets the RNG seed to provide a consistent value between sessions.
You can easily check this in your shell. If PYTHONHASHSEED is not set, you'll get different random values:
unset PYTHONHASHSEED
for i in `seq 1 3`;
do
python3 -c "print(hash('foo'))";
done
## -7298483006336914254
## -6081529125171670673
## -3642265530762908581
but when it is set, you'll get the same value on each execution:
export PYTHONHASHSEED=323
for i in `seq 1 3`;
do
python3 -c "print(hash('foo'))";
done
## 8902216175227028661
## 8902216175227028661
## 8902216175227028661
Since groupBy and other operations which depend on the default partitioner use hashing, you need the same value of PYTHONHASHSEED on all machines in the cluster to get consistent results.
See also:
Python Setup and Usage » Command line and environment
oCERT 2011-003 multiple implementations denial-of-service via hash algorithm collision
Check the Runtime Environment section of the Spark Configuration page: https://spark.apache.org/docs/latest/configuration.html#loading-default-configurations
When running:
$SPARK_HOME/bin/spark-submit
Add:
--conf spark.executorEnv.PYTHONHASHSEED=321
To do this from within Python (rather than having to go back to your terminal), you can do:
import os
os.environ["PYTHONHASHSEED"] = str(232)
with some integer of your choice. (I chose 232 for a quick example.)
Such as:
>>> for el in nums:
... print("Element: [{}]: {} % {} = partition {}".format(
... el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions))
...
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
File "/home/osdi-eval/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 94, in portable_hash
raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
>>>
>>> # Now fix this by adding:
>>> os.environ["PYTHONHASHSEED"]=str(232)
>>>
>>> for el in nums:
... print("Element: [{}]: {} % {} = partition {}".format(
... el, portable_hash(el), num_partitions, portable_hash(el) % num_partitions))
...
Element: [(0, 0)]: 3430028580078870074 % 2 = partition 0
Element: [(1, 1)]: 3430029580083870076 % 2 = partition 0
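If you construct the SparkContext yourself from Python instead of going through spark-submit, the same executor-side property shown above can be set on the SparkConf. A minimal sketch, assuming you control context creation (app name is hypothetical):
from pyspark import SparkConf, SparkContext

# Propagate a fixed PYTHONHASHSEED to every executor's Python worker.
conf = (SparkConf()
        .setAppName("pythonhashseed-example")
        .set("spark.executorEnv.PYTHONHASHSEED", "321"))
sc = SparkContext(conf=conf)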

error in tf-idf implementation mllib spark

I'm trying to perform a TF-IDF transformation on some tweets, so that I can subsequently apply Naive Bayes to them. After applying stopword removal and stemming to the tweets, I have the following RDD:
[u'neutro, marc line polit ibex',
u'neutro, ahor hac mas firm redact jef ibex dad result',
u'neutro, temblor ml am epicentr sant santand repo endesarroll',
u'neutro, cambi tiemp santand ciel cubie temperatur',
u'neutro, sabi pued recobr inversion bon pr perd caus mal asesor ubs santand popul u oriental',
u'neutro, renunci vital sal crisis',
u'neutro, ibex sub punt',
u'neutro, dias acampahd delant bbva manlleu dias luch i gan batall luch continu',
u'neutro, mas natural repsol seri cobr dividend',
u'neutro, luch ibex financi carin obedient',
u'neutro, clav triunf basket cumpl futbol lig bbva via',
u'neutro, colombi despleg primer red viual comercial lte pais',
u'neutro, resistent clav bat repsol encontr eur',
u'neutro, telefon lanz servici vide siet pais latinoamerican inclu',
u'neutro, result empat gol calderon cierr jorn lig bbva lalig']
But when I apply the following code to the RDD:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
documents = trainingSet_cleaned.map(lambda line: line.split(' '))
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
#tfidf = idf.transform(tf)
I get the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-291-69a2d82a8484> in <module>()
6 tf = hashingTF.transform(documents)
7 tf.cache()
----> 8 idf = IDF(minDocFreq=2).fit(tf)
9 #tfidf = idf.transform(tf)
10
/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/pyspark/mllib/feature.py in fit(self, dataset)
414 if not isinstance(dataset, RDD):
415 raise TypeError("dataset should be an RDD of term frequency vectors")
--> 416 jmodel = callMLlibFunc("fitIDF", self.minDocFreq, dataset.map(_convert_to_vector))
417 return IDFModel(jmodel)
418
/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
121 """ Call Java Function """
122 args = [_py2java(sc, a) for a in args]
--> 123 return _java2py(sc, func(*args))
124
125
/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o1296.fitIDF.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 130.0 failed 1 times, most recent failure: Lost task 0.0 in stage 130.0 (TID 130, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "<ipython-input-249-8d34bb5694f4>", line 1, in <lambda>
IndexError: list index out of range
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.next(PythonRDD.scala:101)
at org.apache.spark.api.python.PythonRDD$$anon$1.next(PythonRDD.scala:97)
at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:278)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:262)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:249)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1699)
at org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:208)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1294)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1282)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1281)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1281)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1507)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1469)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1003)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.reduce(RDD.scala:985)
at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1114)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1091)
at org.apache.spark.mllib.feature.IDF.fit(IDF.scala:56)
at org.apache.spark.mllib.feature.IDF.fit(IDF.scala:69)
at org.apache.spark.mllib.api.python.PythonMLLibAPI.fitIDF(PythonMLLibAPI.scala:602)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.5.1-1.cdh5.5.1.p0.11/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "<ipython-input-249-8d34bb5694f4>", line 1, in <lambda>
IndexError: list index out of range
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.next(PythonRDD.scala:101)
at org.apache.spark.api.python.PythonRDD$$anon$1.next(PythonRDD.scala:97)
at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:278)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:262)
at org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:249)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1699)
at org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:208)
I don't know what I'm doing wrong, because I'm applying the functions exactly as explained in the Spark API.
Any clue?

Resources