PySpark - Error when checking if I have NaN in some columns - python-3.x

Try to check if I have NaN value in some columns with
ddf_temp = ddf.select('col1', 'col2' ...) # all int type
ddf_temp.select([count(when(isnull(c), c)).alias(c) for c in ddf_temp.columns]).show()
I could isolate which columns gives me those error but I cannot find out why I got this :
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-47-76c75cf06695> in <module>()
3 # ddf_temp = ddf10.select('state_bottle_cost')
4 ddf_temp = ddf10.where(col('state_bottle_retail').isNull())
----> 5 ddf_temp.show()
6 # ddf_temp = ddf10.select('store_number', 'zip_code', 'county_number', 'category', 'vendor_number', 'pack', 'bottles_sold')
7 # ddf_temp.select([count(when(isnull(c), c)).alias(c) for c in ddf_temp.columns]).show()
3 frames
/content/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o2010.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 43.0 failed 1 times, most recent failure: Lost task 0.0 in stage 43.0 (TID 233, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 345, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 141, in dump_stream
for obj in iterator:
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 334, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
return lambda *a: f(*a)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-11-9ec9e286520d>", line 3, in <lambda>
TypeError: 'NoneType' object is not subscriptable
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:345)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:194)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
at sun.reflect.GeneratedMethodAccessor122.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 345, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 141, in dump_stream
for obj in iterator:
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 334, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
return lambda *a: f(*a)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-11-9ec9e286520d>", line 3, in <lambda>
TypeError: 'NoneType' object is not subscriptable
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:224)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:345)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:194)
EDIT :
remove_first_char = udf(lambda x: x[1:])
ddf4 = ddf3.withColumn('State Bottle Cost', remove_first_char('State Bottle Cost'))
multiply_by_100 = udf(lambda x: x*100)
ddf5 = ddf4.withColumn('State Bottle Cost', ddf4['State Bottle Cost'].cast(DoubleType()))
ddf5 = ddf5.withColumn('State Bottle Cost', multiply_by_100('State Bottle Cost'))
ddf5 = ddf5.withColumn('State Bottle Cost', ddf5['State Bottle Cost'].cast(IntegerType()))

You have Nones in your dataframe and by applying the UDF it will execute None[1:] which gives you the error TypeError: 'NoneType' object is not subscriptable (you can try it in a python shell).
When using built-in pyspark functions it will always map null->null. If you would want to do it via UDF (which is not recommended since spark does internal optimization on built-in sql functions), you would need to catch the None case: lambda x: x if not x else x[1:]

Related

Running spacy in pyspark, but getting ModuleNotFoundError: No module named 'spacy'

I have a venv set up and running 'conda list' in cmd window I can see spacy 2.2.4, en-core-web-sm 2.2.5, pyspark 2.4.5 among other packages.
I want to apply spacy's named entity recognition to a column in a Spark Dataframe called 'tweets' within Jupyter Notebook. I am doing this by passing the NER into a Spark UDF, and passing that UDF into the withColumn operation:
import spacy
nlp = spacy.load("en_core_web_sm")
def spacy_ner(text_col):
entities = []
for parsed in nlp.pipe(text_col):
if parsed.ents:
inner = ''
for ent in parsed.ents:
inner += ent.text + ''
else:
inner = ''
entities.append(inner)
return entities
ner = udf(spacy_ner)
ner = sqlContext.udf.register("ner", spacy_ner)
tweets = tweets.withColumn('text_ner',ner('text_nosw'))
result = tweets.groupBy(tweets.text_ner).count()
result.show()
However I'm getting "ModuleNotFoundError: No module named 'spacy'":
Py4JJavaError: An error occurred while calling o1006.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 43.0 failed 1 times, most recent failure: Lost task 0.0 in stage 43.0 (TID 289, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 366, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 241, in read_udfs
arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 168, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
command = serializer.loads(command.value)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'spacy'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 366, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 241, in read_udfs
arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 168, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
command = serializer.loads(command.value)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'spacy'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-30-9fbae11f2e36> in <module>
4 tweets = tweets.withColumn('text_ner',ner('text_nosw'))
5 result = tweets.groupBy(tweets.text_ner).count()
----> 6 result.show()
~/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
378 """
379 if isinstance(truncate, bool) and truncate:
--> 380 print(self._jdf.showString(n, 20, vertical))
381 else:
382 print(self._jdf.showString(n, int(truncate), vertical))
~/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o1006.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 43.0 failed 1 times, most recent failure: Lost task 0.0 in stage 43.0 (TID 289, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 366, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 241, in read_udfs
arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 168, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
command = serializer.loads(command.value)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'spacy'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 366, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 241, in read_udfs
arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 168, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 71, in read_command
command = serializer.loads(command.value)
File "/Users/Noah/anaconda3/envs/pyspark_env/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'spacy'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Coming back to this. It appears that I needed to restart my Spark session in the end, so I close this. I don't really understand what was happening, but it's not an open issue anymore.

How can I apply the reduceByKey function to a .distinct() object in PySpark?

I am currently trying to create a boolean index for the collection [(1, "winter is coming"), (2, "ours is the fury"), (3, "the old the true the brave")] using PySpark, where the output is key-value pairs in which the keys are the unique words, while the values are lists of the original keys of the collection which contain the words.
First, I parallelised my collection using the following code:
collection=sc.parallelize([(1, "winter is coming"), (2, "ours is the fury"), (3, "the old the true the brave")])
Then, I proceeded to create the index using the following code:
collection.map(lambda x:(x[0],x[1].split(" "))).flatMapValues(lambda x:x).map(lambda x:(x[1],[x[0]])).distinct().reduceByKey(lambda x,y:x+y).collect()
However, after running the line I got an error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-184-d6bc1884fb69> in <module>()
1 collection=sc.parallelize([(1, "winter is coming"), (2, "ours is the fury"), (3, "the old the true the brave")])
----> 2 collection.map(lambda x:(x[0],x[1].split(" "))).flatMapValues(lambda x:x).map(lambda x:(x[1],[x[0]])).distinct().reduceByKey(lambda x,y:x+y).collect()
3 frames
/content/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 150.0 failed 1 times, most recent failure: Lost task 1.0 in stage 150.0 (TID 260, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 352, in func
return f(iterator)
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
d[k] = comb(d[k], v) if k in d else creator(v)
TypeError: unhashable type: 'list'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.GeneratedMethodAccessor45.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 352, in func
return f(iterator)
File "/content/spark-2.4.5-bin-hadoop2.7/python/pyspark/rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "/content/spark-2.4.5-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/shuffle.py", line 240, in mergeValues
d[k] = comb(d[k], v) if k in d else creator(v)
TypeError: unhashable type: 'list'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
I removed the .distinct() attribute i.e.
collection.map(lambda x:(x[0],x[1].split(" "))).flatMapValues(lambda x:x).map(lambda x:(x[1],[x[0]])).reduceByKey(lambda x,y:x+y).collect()
and everything was fine except that the key 'the' included duplicate values of the key of the original tuple it was in. i.e. 3, as can be seen below:
[('is', [1, 2]),
('true', [3]),
('brave', [3]),
('winter', [1]),
('coming', [1]),
('ours', [2]),
('the', [2, 3, 3, 3]),
('fury', [2]),
('old', [3])]
So my question is, how can I remove the duplicate values for the key 'the' before feeding it into the reduceByKey() function? Thanks in advance!!
You could dedupe the list that is created by split() to get the desired result.
collection.map(lambda x:(x[0],list(set(x[1].split(" "))))).flatMapValues(lambda x:x).map(lambda x:(x[1],[x[0]])).reduceByKey(lambda x,y:x+y).collect()
output: [('fury', [2]), ('true', [3]), ('is', [1, 2]), ('old', [3]), ('the', [2, 3]), ('ours', [2]), ('brave', [3]), ('winter', [1]), ('coming', [1])]

Getting errors parsing text using Spacy using PySpark and Jupyter

I am trying to parse some text using spacy to get word dependencies. I am running PySpark in Anaconda with Jupyter notebooks.
Python version: 3.7.5
PySpark version: 2.4.4
Spacy version: 2.2.5
Anaconda version: 4.7.12
Jupyter version: 6.0.2
Here's a MVCE for the error:
import spacy
import en_core_web_sm
from pyspark.sql.functions import *
from pyspark.sql.types import *
def get_token_dep(text):
if text:
nlp = en_core_web_sm.load()
return [(token.text, token.tag_, token.head.text, token.dep_) for token in nlp(text)]
else:
return [['N/A']]
get_token_dep_udf = udf(get_token_dep, ArrayType(ArrayType(StringType())))
text_list = ['Chocolate is a food made from cacao beans.', 'Dessert is a course that concludes a meal.']
text_df = spark.createDataFrame(text_list, StringType())
text_df = text_df.withColumnRenamed(
'value', 'text'
).withColumn(
'parsed_text', get_token_dep_udf('text')
)
display(text_df.toPandas())
However, I am getting errors as follows:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-14-bc4e37a4051a> in <module>
----> 1 display(text_df.toPandas())
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
2141
2142 # Below is toPandas without Arrow optimization.
-> 2143 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
2144
2145 dtype = {}
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
532 """
533 with SCCallSiteSync(self._sc) as css:
--> 534 sock_info = self._jdf.collectToPython()
535 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
536
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o147.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 11.0 failed 1 times, most recent failure: Lost task 7.0 in stage 11.0 (TID 47, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 8, in <module>
import importlib.metadata as importlib_metadata
ModuleNotFoundError: No module named 'importlib.metadata'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 366, in main
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 241, in read_udfs
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 168, in read_single_udf
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 580, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\en_core_web_sm\__init__.py", line 5, in <module>
from spacy.util import load_model_from_init_py, get_model_meta
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\__init__.py", line 12, in <module>
from . import pipeline
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\pipeline\__init__.py", line 4, in <module>
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
File "pipes.pyx", line 1, in init spacy.pipeline.pipes
File "strings.pxd", line 23, in init spacy.syntax.nn_parser
File "strings.pyx", line 17, in init spacy.strings
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\util.py", line 16, in <module>
import catalogue
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 10, in <module>
import importlib_metadata
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\C:\\Users\\user1\\AppData\\Local\\Continuum\\anaconda3\\envs\\py37\\Lib\\site-packages\\pyspark\\jars\\spark-core_2.11-2.4.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 8, in <module>
import importlib.metadata as importlib_metadata
ModuleNotFoundError: No module named 'importlib.metadata'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 366, in main
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 241, in read_udfs
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 168, in read_single_udf
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 580, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\en_core_web_sm\__init__.py", line 5, in <module>
from spacy.util import load_model_from_init_py, get_model_meta
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\__init__.py", line 12, in <module>
from . import pipeline
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\pipeline\__init__.py", line 4, in <module>
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
File "pipes.pyx", line 1, in init spacy.pipeline.pipes
File "strings.pxd", line 23, in init spacy.syntax.nn_parser
File "strings.pyx", line 17, in init spacy.strings
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\util.py", line 16, in <module>
import catalogue
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 10, in <module>
import importlib_metadata
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\C:\\Users\\user1\\AppData\\Local\\Continuum\\anaconda3\\envs\\py37\\Lib\\site-packages\\pyspark\\jars\\spark-core_2.11-2.4.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
I have tried upgrading Python to 3.8 but Jupyter notebooks doesn't support the newer Python version yet. Anyone able to get spacy to work with PySpark on Jupyter notebooks?
Part of the error points to https://github.com/explosion/catalogue/blob/master/catalogue.py#L7, where the import of importlib.metadata seems to go wrong, but not with the expected error type ImportError. I'll make a PR to include the ModuleNotFoundError and let's hope that would fix the issue!
[EDIT:] Hm, ModuleNotFoundError is a subclass of ImportError so I don't understand why that is not properly caught in the except block :|
[EDIT 2:] Logged an issue https://github.com/explosion/catalogue/issues/4 in case this is indeed related to catalogue.py

How to resolve a SparkException when using a User-Defined function?

I need to detect a language by text, and translate that text using PySpark. I could not find any functions for this in PySpark so I created my own UDF's.
Language detection
def detectlang(string):
b = TextBlob(string)
return b.detect_language()
detectlang_udf = udf(detectlang)
Translation
def translate(string):
trans = Translator()
return trans.translate(string).text
translate_udf = udf(translate, StringType())
However when I call these functions and then ask for the result I get the following error:
result = dict_comments[13].withColumn("lang", detectlang_udf(col('Text')))
result.show()
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 15, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
Edit (full error)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-256375544159477> in <module>
1 result = dict_comments[13].withColumn("lang", detectlang_udf(col('Text')))
----> 2 result.show()
/databricks/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
379 """
380 if isinstance(truncate, bool) and truncate:
--> 381 print(self._jdf.showString(n, 20, vertical))
382 else:
383 print(self._jdf.showString(n, int(truncate), vertical))
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o872.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 15, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 480, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 472, in process
serializer.dump_stream(out_iter, outfile)
File "/databricks/spark/python/pyspark/serializers.py", line 456, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/databricks/spark/python/pyspark/serializers.py", line 149, in dump_stream
for obj in iterator:
File "/databricks/spark/python/pyspark/serializers.py", line 445, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/databricks/spark/python/pyspark/worker.py", line 87, in <lambda>
return lambda *a: f(*a)
File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<command-256375544159470>", line 3, in detectlang
File "/databricks/python/lib/python3.7/site-packages/textblob/blob.py", line 568, in detect_language
return self.translator.detect(self.raw)
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:534)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:488)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:640)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:528)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1526)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:534)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2360)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2348)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2347)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2347)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1101)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1101)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1101)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2579)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2527)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2515)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:896)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2280)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:270)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:280)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:80)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:86)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:55)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2889)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3501)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2618)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2618)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3485)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3480)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:111)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:240)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:97)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:170)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3480)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2618)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2832)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:265)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:302)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 480, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 472, in process
serializer.dump_stream(out_iter, outfile)
File "/databricks/spark/python/pyspark/serializers.py", line 456, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/databricks/spark/python/pyspark/serializers.py", line 149, in dump_stream
for obj in iterator:
File "/databricks/spark/python/pyspark/serializers.py", line 445, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/databricks/spark/python/pyspark/worker.py", line 87, in <lambda>
return lambda *a: f(*a)
File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<command-256375544159470>", line 3, in detectlang
File "/databricks/python/lib/python3.7/site-packages/textblob/blob.py", line 568, in detect_language
return self.translator.detect(self.raw)
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:534)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:488)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:640)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:528)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1526)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:534)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Does anybody know how to resolve this or know of any pre-implemented PySpark functions for my goal?
Based on the latest edit, here is the source of your task failure.
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
Spark can sometimes be quite unhelpful in providing errors, so here you need to search all the way for your python exception, as the driver will only say something like Task Lost or Task Failed.
A quick fix to your function is then checking on the len() of the input string, or adding a f.when().otherwise() in pyspark on your function call - this might be more desirable as you will not execute your udf when not needed.
Hope this helps!

Pyspark error ValueError: not enough values to unpack (expected 2, got 1) when trying to group with groupByKey

I downloaded a text file from this site: http://snap.stanford.edu/data/web-Amazon-links.html with the intent to do some text analytics in Pyspark.
So I set up my spark context:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName('app')
sc = SparkContext(conf=conf)
from pyspark.sql import SQLContext
I grabbed the file:
Data1 =sc.textFile('/home/john/Downloads/Software.txt.gz').map(lambda line: line.split(','))
The data looks like this:
[['product/productId: B000068VBQ'],
['product/title: Fisher-Price Rescue Heroes: Lava Landslide'],
['product/price: 8.88'],
['review/userId: unknown'],
['review/profileName: unknown'],
['review/helpfulness: 11/11'],
['review/score: 2.0'],
['review/time: 1042070400'],
['review/summary: Requires too much coordination'],
['review/text: I bought this software for my 5 year old. He has a couple of the other RH software games and he likes them a lot. This game',
' however'
But then I try the groupByKey:
sorted(Data1.groupByKey().mapValues(list).collect())
And I get this error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-15-a3c92709547a> in <module>
----> 1 sorted(Data1.groupByKey().mapValues(list).collect())
~/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818
~/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/anaconda3/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 4, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
process()
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 352, in func
return f(iterator)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 1945, in combine
merger.mergeValues(iterator)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/shuffle.py", line 238, in mergeValues
for k, v in iterator:
ValueError: not enough values to unpack (expected 2, got 1)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in main
process()
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 367, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 352, in func
return f(iterator)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/rdd.py", line 1945, in combine
merger.mergeValues(iterator)
File "/home/john/anaconda3/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/shuffle.py", line 238, in mergeValues
for k, v in iterator:
ValueError: not enough values to unpack (expected 2, got 1)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
​
The problem is in your data and the map you used.
The principle of groupByKey is to use a key and a value to group by key and perform some aggregation on your value data.
But in your RDD, you do not have that key --> value data, just a list of lists ... that is the reason why you have this error.
The list is the 1 argument causing the error message.
I do not know exactly your data and what you want to achieve, but I think you could do something like that for example :
Data1 =sc.textFile('/home/john/Downloads/Software.txt.gz').flatMap(lambda line: line.split("', '"))
Data2 = Data1.map(lambda line : line.split(':')).filter(lambda x : len(x)==2)
sorted(Data2.groupByKey().mapValues(set).collect())

Resources