Pandas UDF for pyspark - Package not found error - apache-spark

I am using the pandas UDF approach to scale my models. However, I am getting an error saying the pmdarima package is not found. The code works fine when I run it in my notebook on the pandas DataFrame itself, so the package is available for use in the notebook. From a few answers online, the error seems to be the package not being available on the worker nodes where Spark is trying to parallelize the code. Can someone help with how to resolve this? How can I install the package on my worker nodes, if that's the case?
FYI - I am working on Azure Databricks.
import pmdarima as pm

def funct1(grp_keys, df):
    # ... other statements (prepare train_data, x, etc.) ...
    model = pm.auto_arima(train_data['sum_hlqty'], X=x,
                          test='adf', trace=False,
                          maxiter=12, max_p=5, max_q=5,
                          n_jobs=-1)
    # ... must return a pandas DataFrame matching the schema below ...

forecast_df = sales.groupby('Col1', 'Col2').applyInPandas(
    funct1, schema="C1 string, C2 string, C3 date, C4 float, C5 float")
Py4JJavaError: An error occurred while calling o256.sql.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:230)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.$anonfun$writeFiles$5(TransactionalWriteEdge.scala:183)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.$anonfun$writeFiles$1(TransactionalWriteEdge.scala:135)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$4(UsageLogging.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:412)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:338)
at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:19)
at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:56)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:129)
at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:71)
at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:58)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:85)
at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:401)
at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:380)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.recordOperation(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:108)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:94)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.recordDeltaOperation(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.writeFiles(TransactionalWriteEdge.scala:92)
at com.databricks.sql.transaction.tahoe.files.TransactionalWriteEdge.writeFiles$(TransactionalWriteEdge.scala:88)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:112)
at com.databricks.sql.transaction.tahoe.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:111)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:84)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.write(WriteIntoDelta.scala:112)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$2(WriteIntoDelta.scala:71)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$2$adapted(WriteIntoDelta.scala:70)
at com.databricks.sql.transaction.tahoe.DeltaLog.withNewTransaction(DeltaLog.scala:203)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.$anonfun$run$1(WriteIntoDelta.scala:70)
at com.databricks.sql.acl.CheckPermissions$.trusted(CheckPermissions.scala:1128)
at com.databricks.sql.transaction.tahoe.commands.WriteIntoDelta.run(WriteIntoDelta.scala:69)
at com.databricks.sql.transaction.tahoe.catalog.WriteIntoDeltaBuilder$$anon$1.insert(DeltaTableV2.scala:193)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1(V1FallbackWriters.scala:118)
at org.apache.spark.sql.execution.datasources.v2.SupportsV1Write.writeWithV1$(V1FallbackWriters.scala:116)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.writeWithV1(V1FallbackWriters.scala:38)
at org.apache.spark.sql.execution.datasources.v2.AppendDataExecV1.run(V1FallbackWriters.scala:44)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:234)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3709)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3707)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:234)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:101)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:680)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:675)
at sun.reflect.GeneratedMethodAccessor655.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 98 in stage 7774.0 failed 4 times, most recent failure: Lost task 98.3 in stage 7774.0 (TID 177293, 10.240.138.10, executor 133): org.apache.spark.api.python.PythonException: 'pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pmdarima''
Full traceback below:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pmdarima'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 638, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/databricks/spark/python/pyspark/worker.py", line 438, in read_udfs
arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
File "/databricks/spark/python/pyspark/worker.py", line 255, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/databricks/spark/python/pyspark/worker.py", line 75, in read_command
command = serializer._read_with_length(file)
File "/databricks/spark/python/pyspark/serializers.py", line 180, in _read_with_length
raise SerializationError("Caused by " + traceback.format_exc())
pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 177, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 466, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pmdarima'
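The import in this traceback fires when the pickled UDF is deserialized on an executor, so pmdarima has to be installed on every node, not just on the driver where the notebook runs. On Databricks you can either attach the package as a cluster library (cluster UI > Libraries > Install New > PyPI > pmdarima) or use a notebook-scoped install; a minimal sketch, assuming Databricks Runtime 7.1+ where %pip installs are propagated to the executors for that notebook session:

%pip install pmdarima

Run that in its own cell, then re-run the cells that define and apply funct1.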

Related

Streaming flink job on dataproc throws gRPC error from the worker

My streaming Flink job (from Pub/Sub source) throws multiple error messages from the worker:
Traceback (most recent call last):
File "test.py", line 175, in <module>
run(
File "test.py", line 139, in run
pub_sub_data = ( pipeline | "Read from Pub/Sub" >> pubsub.ReadFromPubSub(topic=input_topic))
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/transforms/ptransform.py", line 1090, in __ror__
return self.transform.__ror__(pvalueish, self.label)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/transforms/ptransform.py", line 614, in __ror__
result = p.apply(self, pvalueish, label)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/pipeline.py", line 662, in apply
return self.apply(transform, pvalueish)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/pipeline.py", line 708, in apply
pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/runners/runner.py", line 185, in apply
return m(transform, input, options)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/runners/runner.py", line 215, in apply_PTransform
return transform.expand(input)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/io/external/gcp/pubsub.py", line 98, in expand
pcoll = pbegin.apply(
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/pvalue.py", line 134, in apply
return self.pipeline.apply(*arglist, **kwargs)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/pipeline.py", line 708, in apply
pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/runners/runner.py", line 185, in apply
return m(transform, input, options)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/runners/runner.py", line 215, in apply_PTransform
return transform.expand(input)
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/transforms/external.py", line 473, in expand
response = service.Expand(request)
File "/opt/conda/default/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/opt/conda/default/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNAVAILABLE
details = "failed to connect to all addresses"
debug_error_string = "{"created":"#1651418111.458421765","description":"Failed to pick subchannel","file":"src/core/ext/filters/client_channel/client_channel.cc","file_line":3128,"referenced_errors":[{"created":"#1651418111.458419596","description":"failed to connect to all addresses","file":"src/core/lib/transport/error_utils.cc","file_line":163,"grpc_status":14}]}"
>
I am using the apache_beam.io.external.gcp.pubsub.ReadFromPubSub function to read from the Pub/Sub topic.
Python 3.8
apache-beam[gcp] 2.34.0
Flink 1.12
Code:
pipeline_options = PipelineOptions(
    pipeline_args, streaming=True, checkpointing_interval=1000, save_main_session=True
)
with Pipeline(options=pipeline_options) as pipeline:
    pub_sub_data = (pipeline | "Read from Pub/Sub" >> pubsub.ReadFromPubSub(topic=input_topic))
I did try apache_beam.io.ReadFromPubSub for reading from the Pub/Sub topic, and below is the error I get:
DEBUG:root:java.lang.IllegalArgumentException: PCollectionNodes [PCollectionNode{id=ref_PCollection_PCollection_1, PCollection=unique_name: "22Read from Pub/Sub/Read.None"
coder_id: "ref_Coder_BytesCoder_1"
is_bounded: UNBOUNDED
windowing_strategy_id: "ref_Windowing_Windowing_1"
}] were consumed but never produced
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument(Preconditions.java:440)
at org.apache.beam.runners.core.construction.graph.QueryablePipeline.buildNetwork(QueryablePipeline.java:234)
at org.apache.beam.runners.core.construction.graph.QueryablePipeline.<init>(QueryablePipeline.java:127)
at org.apache.beam.runners.core.construction.graph.QueryablePipeline.forPrimitivesIn(QueryablePipeline.java:90)
at org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser.<init>(GreedyPipelineFuser.java:70)
at org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser.fuse(GreedyPipelineFuser.java:93)
at org.apache.beam.runners.flink.FlinkPipelineRunner.runPipelineWithTranslator(FlinkPipelineRunner.java:112)
at org.apache.beam.runners.flink.FlinkPipelineRunner.run(FlinkPipelineRunner.java:85)
at org.apache.beam.runners.jobsubmission.JobInvocation.runPipeline(JobInvocation.java:86)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:57)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:750)
ERROR:root:java.lang.IllegalArgumentException: PCollectionNodes [PCollectionNode{id=ref_PCollection_PCollection_1, PCollection=unique_name: "22Read from Pub/Sub/Read.None"
coder_id: "ref_Coder_BytesCoder_1"
is_bounded: UNBOUNDED
windowing_strategy_id: "ref_Windowing_Windowing_1"
}] were consumed but never produced
INFO:apache_beam.runners.portability.portable_runner:Job state changed to FAILED
Traceback (most recent call last):
File "test.py", line 175, in <module>
run(
File "test.py", line 144, in run
_ = main_error | "Transformation Errors to GCS" >> ParDo(WriteToGCS(output_path))
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/pipeline.py", line 597, in __exit__
self.result.wait_until_finish()
File "/home/ravi/.local/lib/python3.8/site-packages/apache_beam/runners/portability/portable_runner.py", line 600, in wait_until_finish
raise self._runtime_exception
RuntimeError: Pipeline BeamApp-ravi-0501153516-4f843e9f_2e7c1bb8-7ac7-4adc-a8f4-fa9f0f97b770 failed in state FAILED: java.lang.IllegalArgumentException: PCollectionNodes [PCollectionNode{id=ref_PCollection_PCollection_1, PCollection=unique_name: "22Read from Pub/Sub/Read.None"
coder_id: "ref_Coder_BytesCoder_1"
is_bounded: UNBOUNDED
windowing_strategy_id: "ref_Windowing_Windowing_1"
}] were consumed but never produced
DEBUG:root:Sending SIGINT to job_server
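One thing worth checking, though the trace alone does not prove it: the transforms under apache_beam.io.external are cross-language, and at pipeline-construction time they call out to an expansion service over gRPC. StatusCode.UNAVAILABLE with "failed to connect to all addresses" is what gRPC reports when that service is unreachable. A sketch of pointing the read at an explicitly started expansion service; the argument name and address are assumptions to verify against your Beam version:

pub_sub_data = (
    pipeline
    | "Read from Pub/Sub" >> pubsub.ReadFromPubSub(
        topic=input_topic,
        expansion_service="localhost:8097",  # assumed host:port of a running expansion service
    )
)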

java.io.IOException: No FileSystem for scheme: C and WinError 10054: An existing connection was forcibly closed by the remote host

I was trying to connect to and fetch data from a BigQuery dataset into local PyCharm using PySpark.
I ran the script below in PyCharm:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .config('spark.jars', "C:/Users/PycharmProjects/pythonProject/spark-bigquery-latest.jar") \
    .getOrCreate()

conn = spark.read.format("bigquery") \
    .option("credentialsFile", "C:/Users/PycharmProjects/pythonProject/google-bq-api.json") \
    .option("parentProject", "Google-Project-ID") \
    .option("project", "Dataset-Name") \
    .option("table", "dataset.schema.tablename") \
    .load()

conn.show()
For this I got the below error:
Exception in thread "main" java.io.IOException: No FileSystem for scheme: C
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPath(DependencyUtils.scala:191)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2(DependencyUtils.scala:147)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2$adapted(DependencyUtils.scala:145)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:108)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPaths(DependencyUtils.scala:145)
at org.apache.spark.deploy.SparkSubmit.$anonfun$prepareSubmitEnvironment$4(SparkSubmit.scala:363)
at scala.Option.map(Option.scala:230)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:363)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:871)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:\Users\naveen.chandar\PycharmProjects\pythonProject\BigQueryConnector.py", line 4, in <module>
spark = SparkSession.builder.config('spark.jars', 'C:/Users/naveen.chandar/PycharmProjects/pythonProject/spark-bigquery-latest.jar').getOrCreate()
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\sql\session.py", line 186, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 376, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 133, in __init__
SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 325, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\java_gateway.py", line 105, in launch_gateway
raise Exception("Java gateway process exited before sending its port number")
Exception: Java gateway process exited before sending its port number
So I researched and tried it from a different directory like the D drive, and also tried to fix a static port with set PYSPARK_SUBMIT_ARGS="--master spark://<IP_Address>:<Port>", but I still got the same error in PyCharm.
Then I tried running the same script in a local Command Prompt under PySpark, and I got this error:
failed to find class org/conscrypt/CryptoUpcalls
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1152, in send_command
answer = smart_decode(self.stream.readline()[:-1])
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 985, in send_command
response = connection.send_command(command)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1164, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\dataframe.py", line 381, in show
print(self._jdf.showString(n, 20, vertical))
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 336, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling o42.showString
My Python version is 3.7.9 and my Spark version is 2.4.7.
Either way, I have run out of ideas and would appreciate some help on either of the situations I am facing.
Thanks in advance!
Start your file system references with file:///c:/...
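For example, the builder line from the question becomes (a sketch):

spark = SparkSession.builder \
    .config('spark.jars', "file:///C:/Users/PycharmProjects/pythonProject/spark-bigquery-latest.jar") \
    .getOrCreate()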
You need to replace / with \ for the path to work

Getting errors parsing text using Spacy using PySpark and Jupyter

I am trying to parse some text using spacy to get word dependencies. I am running PySpark in Anaconda with Jupyter notebooks.
Python version: 3.7.5
PySpark version: 2.4.4
Spacy version: 2.2.5
Anaconda version: 4.7.12
Jupyter version: 6.0.2
Here's an MVCE for the error:
import spacy
import en_core_web_sm
from pyspark.sql.functions import *
from pyspark.sql.types import *

def get_token_dep(text):
    if text:
        nlp = en_core_web_sm.load()
        return [(token.text, token.tag_, token.head.text, token.dep_) for token in nlp(text)]
    else:
        return [['N/A']]

get_token_dep_udf = udf(get_token_dep, ArrayType(ArrayType(StringType())))

text_list = ['Chocolate is a food made from cacao beans.', 'Dessert is a course that concludes a meal.']
text_df = spark.createDataFrame(text_list, StringType())
text_df = text_df.withColumnRenamed(
    'value', 'text'
).withColumn(
    'parsed_text', get_token_dep_udf('text')
)
display(text_df.toPandas())
However, I am getting errors as follows:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-14-bc4e37a4051a> in <module>
----> 1 display(text_df.toPandas())
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
2141
2142 # Below is toPandas without Arrow optimization.
-> 2143 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
2144
2145 dtype = {}
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
532 """
533 with SCCallSiteSync(self._sc) as css:
--> 534 sock_info = self._jdf.collectToPython()
535 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
536
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o147.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 11.0 failed 1 times, most recent failure: Lost task 7.0 in stage 11.0 (TID 47, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 8, in <module>
import importlib.metadata as importlib_metadata
ModuleNotFoundError: No module named 'importlib.metadata'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 366, in main
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 241, in read_udfs
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 168, in read_single_udf
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 580, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\en_core_web_sm\__init__.py", line 5, in <module>
from spacy.util import load_model_from_init_py, get_model_meta
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\__init__.py", line 12, in <module>
from . import pipeline
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\pipeline\__init__.py", line 4, in <module>
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
File "pipes.pyx", line 1, in init spacy.pipeline.pipes
File "strings.pxd", line 23, in init spacy.syntax.nn_parser
File "strings.pyx", line 17, in init spacy.strings
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\util.py", line 16, in <module>
import catalogue
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 10, in <module>
import importlib_metadata
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\C:\\Users\\user1\\AppData\\Local\\Continuum\\anaconda3\\envs\\py37\\Lib\\site-packages\\pyspark\\jars\\spark-core_2.11-2.4.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 8, in <module>
import importlib.metadata as importlib_metadata
ModuleNotFoundError: No module named 'importlib.metadata'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 366, in main
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 241, in read_udfs
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 168, in read_single_udf
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 580, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\en_core_web_sm\__init__.py", line 5, in <module>
from spacy.util import load_model_from_init_py, get_model_meta
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\__init__.py", line 12, in <module>
from . import pipeline
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\pipeline\__init__.py", line 4, in <module>
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
File "pipes.pyx", line 1, in init spacy.pipeline.pipes
File "strings.pxd", line 23, in init spacy.syntax.nn_parser
File "strings.pyx", line 17, in init spacy.strings
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\spacy\util.py", line 16, in <module>
import catalogue
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\catalogue.py", line 10, in <module>
import importlib_metadata
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\user1\AppData\Local\Continuum\anaconda3\envs\py37\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\\C:\\Users\\user1\\AppData\\Local\\Continuum\\anaconda3\\envs\\py37\\Lib\\site-packages\\pyspark\\jars\\spark-core_2.11-2.4.4.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
I have tried upgrading Python to 3.8, but Jupyter notebooks don't support the newer Python version yet. Has anyone been able to get spacy to work with PySpark on Jupyter notebooks?
Part of the error points to https://github.com/explosion/catalogue/blob/master/catalogue.py#L7, where the import of importlib.metadata seems to go wrong, but not with the expected error type ImportError. I'll make a PR to include ModuleNotFoundError, and let's hope that fixes the issue!
[EDIT:] Hm, ModuleNotFoundError is a subclass of ImportError so I don't understand why that is not properly caught in the except block :|
[EDIT 2:] Logged an issue https://github.com/explosion/catalogue/issues/4 in case this is indeed related to catalogue.py
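As an aside on the MVCE itself, and orthogonal to the catalogue issue: the failing import chain runs while the pickled UDF is being loaded, because the function body references the module-level en_core_web_sm, which cloudpickle re-imports on the worker. A common pattern is to defer the import into the function and cache the loaded model in a lazy global, so the model is built once per worker process instead of once per row; a sketch, not verified to dodge the Windows path error above:

_nlp = None

def get_token_dep(text):
    global _nlp
    if not text:
        return [['N/A']]
    if _nlp is None:
        import en_core_web_sm  # imported on the worker at call time, not at unpickle time
        _nlp = en_core_web_sm.load()  # loaded once per worker process
    return [(token.text, token.tag_, token.head.text, token.dep_) for token in _nlp(text)]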

Serialization error with Spark Pandas_UDF

I have a Python function that I converted to a Pandas_UDF function, and it worked fine up until last week, but I have been getting the below error for the last few days. We tried a simple Python function with a Pandas UDF and it does not throw this error, so I am not sure what exactly in my code is causing it. Has there been any change to the Spark environment? I am using Azure Databricks, if that helps.
My search only turned up this link, but it is old.
I would appreciate any pointers on how to fix this issue.
Thanks,
Yudi
SparkException: Job aborted due to stage failure: Task 0 in stage 23.0 failed 4 times, most recent failure: Lost task 0.3 in stage 23.0 (TID 252, 172.17.69.7, executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 180, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 669, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 875, in subimport
__import__(name)
ImportError: No module named '_pandasujson'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 394, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/databricks/spark/python/pyspark/worker.py", line 234, in read_udfs
arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf)
File "/databricks/spark/python/pyspark/worker.py", line 160, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/databricks/spark/python/pyspark/worker.py", line 69, in read_command
command = serializer._read_with_length(file)
File "/databricks/spark/python/pyspark/serializers.py", line 183, in _read_with_length
raise SerializationError("Caused by " + traceback.format_exc())
pyspark.serializers.SerializationError: Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 180, in _read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 669, in loads
return pickle.loads(obj, encoding=encoding)
File "/databricks/spark/python/pyspark/cloudpickle.py", line 875, in subimport
__import__(name)
ImportError: No module named '_pandasujson'
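Since the failure is an ImportError raised while unpickling the UDF on an executor, one quick thing to rule out is a pandas version mismatch between the driver and the workers (the missing module name looks like a pandas internal). A diagnostic sketch:

import pandas as pd
print("driver pandas:", pd.__version__)
print("executor pandas:",
      spark.sparkContext.range(1).map(lambda _: __import__("pandas").__version__).collect())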

Add date field to RDD in Spark

I have a pretty simple RDD called STjoin, to which I apply a simple function to get the day out of a string representing the date-time.
The code passes lazy evaluation, but if I run the last line (STjoinday.take(5)), I get an error.
def parsedate(x):
    try:
        dt = dateutil.parser.parse(x[1]).date()
    except:
        dt = dateutil.parser.parse("01 Jan 1900 00:00:00").date()
    x.append(dt)
    return x

STjoinday = STjoin.map(lambda line: parsedate(line))
#STjoinday.take(5)
What is the problem here?
Long error traceback below:
15/04/27 22:14:02 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 8)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/worker.py", line 79, in main
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 196, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 127, in dump_stream
for obj in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 185, in _batched
for item in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1147, in takeUpToNumLeft
yield next(iterator)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 72, in parsedate
dt=dateutil.parser.parse("01 Jan 1900 00:00:00").date()
AttributeError: 'module' object has no attribute 'parser'
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:124)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:154)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:87)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
at org.apache.spark.scheduler.Task.run(Task.scala:54)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:178)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
15/04/27 22:14:02 ERROR TaskSetManager: Task 0 in stage 6.0 failed 1 times; aborting job
Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 79, in <module>
STjoinday.take(5)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1152, in take
res = self.context.runJob(self, takeUpToNumLeft, p, True)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/context.py", line 770, in runJob
it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 8, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/worker.py", line 79, in main
serializer.dump_stream(func(split_index, iterator), outfile)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 196, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 127, in dump_stream
for obj in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/serializers.py", line 185, in _batched
for item in iterator:
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/python/pyspark/rdd.py", line 1147, in takeUpToNumLeft
yield next(iterator)
File "/home/terrapin/Spark_Hadoop/spark-1.1.1-bin-cdh4/test3.py", line 72, in parsedate
dt=dateutil.parser.parse("01 Jan 1900 00:00:00").date()
AttributeError: 'module' object has no attribute 'parser'
org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:124)
org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:154)
org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:87)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:62)
org.apache.spark.scheduler.Task.run(Task.scala:54)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:178)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:688)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:688)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1391)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
As pointed out in other answers and comments, the problem is with the importation of dateutil. I found a way that works, even though I am not sure why the others fail. Instead of the above, use:
from dateutil.parser import parse as parse_date
and then:
dt = parse_date("01 Jan 1900 00:00:00").date()
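Putting that together, the mapped function from the question becomes (a sketch):

from dateutil.parser import parse as parse_date

def parsedate(x):
    try:
        dt = parse_date(x[1]).date()
    except Exception:
        dt = parse_date("01 Jan 1900 00:00:00").date()
    x.append(dt)
    return x

STjoinday = STjoin.map(parsedate)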
Looks like dateutil is not a standard Python package. You need to distribute it to every worker node.
Can you post what happens when you just import dateutil after starting a Python shell? Maybe you are missing some entry in PYTHONPATH.
