Add date field to RDD in Spark - apache-spark

I have a pretty simple RDD called STjoin on which I pass a simple function to get the day out of a string representing the date-time.
The code passes lazy evaluation, but if I run the last line (STjoinday.take(5)), I get an error.
def parsedate(x):
try:
dt=dateutil.parser.parse(x[1]).date()
except:
dt=dateutil.parser.parse("01 Jan 1900 00:00:00").date()
x.append(dt)
return x
STjoinday=STjoin.map(lambda line: parsedate(line))
#STjoinday.take(5)
What is the problem here?
Long error traceback below:
15/04/27 22:14:02 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 8)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/worker.py", line 79, in main
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 196, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 127, in dump_stream
for obj in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 185, in _batched
for item in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1147, in takeUpToNumLeft
yield next(iterator)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 72, in parsedate
dt=dateutil.parser.parse("01 Jan 1900 00:00:00").date()
AttributeError: 'module' object has no attribute 'parser'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:124)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:154)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:87)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:54)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:178)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/04/27 22:14:02 ERROR TaskSetManager: Task 0 in stage 6.0 failed 1 times; aborting job
Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 79, in <module>
STjoinday.take(5)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1152, in take
res = self.context.runJob(self, takeUpToNumLeft, p, True)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/context.py", line 770, in runJob
it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 8, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/worker.py", line 79, in main
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 196, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 127, in dump_stream
for obj in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 185, in _batched
for item in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1147, in takeUpToNumLeft
yield next(iterator)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 72, in parsedate
dt=dateutil.parser.parse("01 Jan 1900 00:00:00").date()
AttributeError: 'module' object has no attribute 'parser'
org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:124)
org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:154)
org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:87)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
org.apache.spark.scheduler.Task.run(Task.scala:54)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:178)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1391)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

As pointed out in other answers and comments, the problem is with the importation of dateutils. I found a way that works, even though I am not sure why the others fail. Instead of the above:
from dateutil.parser import parse as parse_date
then use:
dt=parse_date("01 Jan 1900 00:00:00").date()

Looks like dateutil is not a standard python pkg. You need to distribute it to every worker node.
Can you post what happens when you just import dateutil after running python shell? May be you are missing some entry in PYTHONPATH

Related

PySpark task exception handling

I have a loop in a pyspark (Spark3 cluster) task like this :
def myfunc(rows):
#Some dynamodb Table initiation stuff (nothing fancy)
with table.batch_writer() as batch:
for row in rows():
try:
batch.put_item(..)
except ClientError as e:
if e.response['Error']['Code'] == "ProvisionedThroughputExceededException":
#handle the issue here
And here is the call to this function from spark
df.foreachPartition(lambda x : myfunc(x))
This code actually works fine. Sometime I receive the exception ProvisionedThroughputExceededException and it's handled. However something super weird is that, if the task handling the bunch of rows seems to encounter the exception it will end as a failing task eventhough the excpetion has been handled, as if spark task check some kind of historical exception to see if something bad happend during the processing:
Here the output from the task :
Getting An error occurred (ProvisionedThroughputExceededException) when calling the BatchWriteItem operation ... ==> handling
Getting An error occurred (ProvisionedThroughputExceededException) when calling the BatchWriteItem operation ... ==> handling
Getting An error occurred (ProvisionedThroughputExceededException) when calling the BatchWriteItem operation ... ==> handling
2022-03-30 08:40:33,029 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 9)
and then it prints out the stack trace as follows
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000001/pyspark.zip/pyspark/rdd.py", line 2596, in pipeline_func
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000001/pyspark.zip/pyspark/rdd.py", line 2596, in pipeline_func
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000001/pyspark.zip/pyspark/rdd.py", line 2596, in pipeline_func
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000001/pyspark.zip/pyspark/rdd.py", line 425, in func
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000001/pyspark.zip/pyspark/rdd.py", line 874, in func
File "6.YL_flow_2-ecf3d86.py", line 136, in <lambda>
File "6.YL_flow_2-ecf3d86.py", line 98, in greedy_dyn_send
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/env/lib/python3.7/site-packages/boto3/dynamodb/table.py", line 156, in __exit__
self._flush()
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/env/lib/python3.7/site-packages/boto3/dynamodb/table.py", line 137, in _flush
RequestItems={self._table_name: items_to_send})
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/env/lib/python3.7/site-packages/botocore/client.py", line 388, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/srv/ssd2/yarn/nm/usercache/svc_df_omni/appcache/application_1648119616278_365920/container_e298_1648119616278_365920_01_000004/env/lib/python3.7/site-packages/botocore/client.py", line 708, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.errorfactory.ProvisionedThroughputExceededException: An error occurred (ProvisionedThroughputExceededException) when calling the BatchWriteItem operation (reached max retries: 1): The level of configured provisioned throughput for the table was exceeded. Consider increasing your provisioning level with the UpdateTable API.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2154)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:462)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:465)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
2022-03-30 08:40:33,089 INFO YarnCoarseGrainedExecutorBackend: Got assigned task 73
So I was wondering how spark handle "finishing" a task. Will it say that the task is failed if we encounter an exception and handled it ? Should we clean something whenever we handle an exception ?

Pandas UDF for pyspark - Package not found error

I am using the pandas UDF approach to scale my models. However, I am getting an error with the pmdarima package not found. The code works fine till I run it on my notebook on the pandas dataframe itself. So the package is available for use in the notebook. From few answers online, the error seems in package not being available on the worker nodes where the code is trying to parallelize. Can someone help on how to resolve this? How can I also install the package on my worker nodes, if that's the case.
FYI - I am working on Azure Databricks.
def funct1(grp_keys, df):
other statements
model = pm.auto_arima(train_data['sum_hlqty'],X=x,
test='adf',trace=False,
maxiter = 12,max_p=5,max_q=5,
njobs=-1)
forecast_df = sales.groupby('Col1','Col2').applyInPandas(funct1,schema="C1 string, C2 string, C3 date, C4 float, C5 float")
Py4JJavaError: An error occurred while calling o256.sql.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:230)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.$anonfun$writeFiles$5(TransactionalWriteEdge.scala:183)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.$anonfun$writeFiles$1(TransactionalWriteEdge.scala:135)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$4(UsageLogging.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:412)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:338)
at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:19)
at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:56)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:129)
at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:71)
at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:58)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:85)
at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:401)
at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:380)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.recordOperation(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:108)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:94)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.recordDeltaOperation(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.writeFiles(TransactionalWriteEdge.scala:92)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.writeFiles$(TransactionalWriteEdge.scala:88)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:112)
at com.databricks.sql.transaction.tahoe.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:111)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.write(WriteIntoDelta.scala:112)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$2(WriteIntoDelta.scala:71)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$2$adapted(WriteIntoDelta.scala:70)
at com.databricks.sql.transaction.tahoe.DeltaLog.withNewTransaction(DeltaLog.scala:203)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$1(WriteIntoDelta.scala:70)
at com.databricks.sql.acl.CheckPermissions$.trusted(CheckPermissions.scala:1128)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.run(WriteIntoDelta.scala:69)
at com.databricks.sql.transaction.tahoe.catalog.WriteIntoDeltaBuilder$$anon$1.insert(DeltaTableV2.scala:193)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1(V1FallbackWriters.scala:118)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1$(V1FallbackWriters.scala:116)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.writeWithV1(V1FallbackWriters.scala:38)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.run(V1FallbackWriters.scala:44)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:234)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3709)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3707)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:234)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:101)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:680)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:675)
at sun.reflect.GeneratedMethodAccessor655.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 98 in stage 7774.0 failed 4 times, most recent failure: Lost task 98.3 in stage 7774.0 (TID 177293, 10.240.138.10, executor 133): org.apache.spark.api.python.PythonException: 'pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
**ModuleNotFoundError: No module named 'pmdarima''** Full traceback below:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pmdarima'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 638, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/databricks/spark/python/pyspark/worker.py", line 438, in read_udfs
arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
File "/databricks/spark/python/pyspark/worker.py", line 255, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/databricks/spark/python/pyspark/worker.py", line 75, in read_command
command = serializer._read_with_length(file)
File "/databricks/spark/python/pyspark/serializers.py", line 180, in _read_with_length
raise SerializationError("Caused by " + traceback.format_exc())
pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
**ModuleNotFoundError: No module named 'pmdarima'****strong text**

java.io.IOException: No FileSystem for scheme: C and WinError 10054: An existing connection was forcibly closed by the remote host

I was trying to Connect and Fetch data from BigQuery Dataset to Local Pycharm Using Pyspark.
I ran this below Script in Pycharm:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.config('spark.jars', "C:/Users/PycharmProjects/pythonProject/spark-bigquery-latest.jar")\
.getOrCreate()
conn = spark.read.format("bigquery")\
.option("credentialsFile", "C:/Users/PycharmProjects/pythonProject/google-bq-api.json")\
.option("parentProject", "Google-Project-ID")\
.option("project", "Dataset-Name")\
.option("table", "dataset.schema.tablename")\
.load()
conn.show()
For this I got the below error:
Exception in thread "main" java.io.IOException: No FileSystem for scheme: C
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPath(DependencyUtils.scala:191)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2(DependencyUtils.scala:147)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2$adapted(DependencyUtils.scala:145)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:108)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPaths(DependencyUtils.scala:145)
at org.apache.spark.deploy.SparkSubmit.$anonfun$prepareSubmitEnvironment$4(SparkSubmit.scala:363)
at scala.Option.map(Option.scala:230)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:363)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:871)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:\Users\naveen.chandar\PycharmProjects\pythonProject\BigQueryConnector.py", line 4, in <module>
spark = SparkSession.builder.config('spark.jars', 'C:/Users/naveen.chandar/PycharmProjects/pythonProject/spark-bigquery-latest.jar').getOrCreate()
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\sql\session.py", line 186, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 376, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 133, in __init__
SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 325, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\java_gateway.py", line 105, in launch_gateway
raise Exception("Java gateway process exited before sending its port number")
Exception: Java gateway process exited before sending its port number
So, I researched and tried it from different Diecrtory like "D-drive" and also tried to fix a static port with set PYSPARK_SUBMIT_ARGS="--master spark://<IP_Address>:<Port>", but still I got the same error in Pycharm.
Then I thought of trying the same script in local Command Prompt under Pyspark and I got this error:
failed to find class org/conscrypt/CryptoUpcalls
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1152, in send_command
answer = smart_decode(self.stream.readline()[:-1])
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 985, in send_command
response = connection.send_command(command)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1164, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\dataframe.py", line 381, in show
print(self._jdf.showString(n, 20, vertical))
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 336, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling o42.showString
My Python Version is 3.7.9 and Spark Version is 2.4.7
So either way I ran out of idea's and I appreciate some help on any one of the situation I facing...
Thanks In Advance!!
Start your file system references with file:///c:/...
You need to replace / with \ for the path to work

How to resolve a SparkException when using a User-Defined function?

I need to detect a language by text, and translate that text using PySpark. I could not find any functions for this in PySpark so I created my own UDF's.
Language detection
def detectlang(string):
b = TextBlob(string)
return b.detect_language()
detectlang_udf = udf(detectlang)
Translation
def translate(string):
trans = Translator()
return trans.translate(string).text
translate_udf = udf(translate, StringType())
However when I call these functions and then ask for the result I get the following error:
result = dict_comments[13].withColumn("lang", detectlang_udf(col('Text')))
result.show()
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 15, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
Edit (full error)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-256375544159477> in <module>
1 result = dict_comments[13].withColumn("lang", detectlang_udf(col('Text')))
----> 2 result.show()
/databricks/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
379 """
380 if isinstance(truncate, bool) and truncate:
--> 381 print(self._jdf.showString(n, 20, vertical))
382 else:
383 print(self._jdf.showString(n, int(truncate), vertical))
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o872.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 15, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 480, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 472, in process
serializer.dump_stream(out_iter, outfile)
File "/databricks/spark/python/pyspark/serializers.py", line 456, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/databricks/spark/python/pyspark/serializers.py", line 149, in dump_stream
for obj in iterator:
File "/databricks/spark/python/pyspark/serializers.py", line 445, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/databricks/spark/python/pyspark/worker.py", line 87, in <lambda>
return lambda *a: f(*a)
File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<command-256375544159470>", line 3, in detectlang
File "/databricks/python/lib/python3.7/site-packages/textblob/blob.py", line 568, in detect_language
return self.translator.detect(self.raw)
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:534)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:488)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:640)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:528)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1526)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:534)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2360)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2348)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2347)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2347)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1101)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1101)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1101)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2579)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2527)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2515)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:896)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2280)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:270)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:280)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:80)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:86)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:55)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2889)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3501)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2618)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2618)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3485)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3480)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:111)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:240)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:97)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:170)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3480)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2618)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2832)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:265)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:302)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 480, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 472, in process
serializer.dump_stream(out_iter, outfile)
File "/databricks/spark/python/pyspark/serializers.py", line 456, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/databricks/spark/python/pyspark/serializers.py", line 149, in dump_stream
for obj in iterator:
File "/databricks/spark/python/pyspark/serializers.py", line 445, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/databricks/spark/python/pyspark/worker.py", line 87, in <lambda>
return lambda *a: f(*a)
File "/databricks/spark/python/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<command-256375544159470>", line 3, in detectlang
File "/databricks/python/lib/python3.7/site-packages/textblob/blob.py", line 568, in detect_language
return self.translator.detect(self.raw)
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:534)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:488)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:640)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:528)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1526)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:534)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Does anybody know how to resolve this or know of any pre-implemented PySpark functions for my goal?
Based on the latest edit, here is the source of your task failure.
File "/databricks/python/lib/python3.7/site-packages/textblob/translate.py", line 69, in detect
raise TranslatorError('Must provide a string with at least 3 characters.')
textblob.exceptions.TranslatorError: Must provide a string with at least 3 characters.
Spark can sometimes be quite unhelpful in providing errors, so here you need to search all the way for your python exception, as the driver will only say something like Task Lost or Task Failed.
A quick fix to your function is then checking on the len() of the input string, or adding a f.when().otherwise() in pyspark on your function call - this might be more desirable as you will not execute your udf when not needed.
Hope this helps!

Mysterious 'pyarrow.lib.ArrowInvalid: Floating point value truncated' ERROR when use toPandas() on a DataFrame in pyspark

I use toPandas() on a DataFrame which is not very large, but I get the following exception:
18/10/31 19:13:19 ERROR Executor: Exception in task 127.2 in stage 13.0 (TID 2264)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
process()
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/serializers.py", line 261, in dump_stream
batch = _create_batch(series, self._timezone)
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/serializers.py", line 239, in _create_batch
arrs = [create_array(s, t) for s, t in series]
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/serializers.py", line 239, in <listcomp>
arrs = [create_array(s, t) for s, t in series]
File "/home/hadoop/spark2.3.1/python/lib/pyspark.zip/pyspark/serializers.py", line 237, in create_array
return pa.Array.from_pandas(s, mask=mask, type=t)
File "pyarrow/array.pxi", line 474, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 169, in pyarrow.lib.array
File "pyarrow/array.pxi", line 69, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 81, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Floating point value truncated
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:171)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:121)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage19.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage19.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Sometimes, this exception can be ignored and I can get the right result, but more often, the program exited. Does anyone know about this mysterious error?
I came across the same error. I think #bryanc is right that you need to safely cast the type. In my case the data is in bigint while the function needs float/double. So I did
from pyspark.sql.types import DoubleType
df = df.withColumn("x_dbl", df["x"].cast(DoubleType()))
following the approach in how to change a Dataframe column from String type to Double type in pyspark
Then instead of applying the function on "x", I did on "x_dbl" and it worked. Hope this helps!
What version of pyarrow are you using? I believe starting with version 0.11.0, unsafe type casts will raise an error.

Resources