I am running spark 2.0 and zeppelin-0.6.1-bin-all on a Linux server. The default spark notebook runs just fine, but when I try to create and run a new notebook in pyspark using sqlContext I get the error "py4j.Py4JException: Method createDataFrame([class java.util.ArrayList, class java.util.ArrayList, null]) does not exist"
I tried running a simple code,
%pyspark
wordsDF = sqlContext.createDataFrame([('cat',), ('elephant',), ('rat',), ('rat',), ('cat', )], ['word'])
wordsDF.show()
print type(wordsDF)
wordsDF.printSchema()
I get the error,
Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-7635635698598314374.py", line 266, in
raise Exception(traceback.format_exc())
Exception: Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-7635635698598314374.py", line 259, in
exec(code)
File "", line 1, in
File "/spark/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/context.py", line 299, in createDataFrame
return self.sparkSession.createDataFrame(data, schema, samplingRatio)
File "/spark/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py", line 933, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/spark/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/spark/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py", line 316, in get_return_value
format(target_id, ".", name, value))
Py4JError: An error occurred while calling o48.createDataFrame. Trace:
py4j.Py4JException: Method createDataFrame([class java.util.ArrayList, class java.util.ArrayList, null]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
When I try the same code with "sqlContext = SQLContext(sc)" it works just fine.
I have tried setting the interpreter "zeppelin.spark.useHiveContext false" configuration but it did not work.
I must obviously be missing something since this is such a simple operation. Please advice if there is any other configuration to be set or what I am missing.
I tested the same piece of code with Zeppelin 0.6.0 and it is working fine.
SparkSession is the default entry-point for Spark 2.0.0, which is mapped to spark in Zeppelin 0.6.1 (as it is in the Spark shell). Have you tried spark.createDataFrame(...)?
Related
I'm trying to run pyspark readStream/writeStream using python multiprocessing as a databricks job. Not directly from the notebook but saving the script in .py file inside dbfs:. I can run the readStream/writeStream from notebook but when I try to do the same from the file it throws Method writeStream([]) does not exist
Code
import multiprocessing
import time
from pyspark.sql import SparkSession
def _get_c3_value(df, epochId, v):
print('inside get_c3')
A = df.collect()[0][0]
v.value = A
print(A)
def _readStream_c3(v):
spark = SparkSession.builder.getOrCreate()
df = (spark.readStream.format('delta').option('readChangeFeed','true').option('startingVersion','latest').option('_change_type', 'insert').load("dbfs:/mnt/path/to/file").writeStream.foreachBatch(lambda df, epochId: _get_c3_value(df, epochId,v)).outputMode('append').start())
df.awaitTermination()
if __name__ == '__main__':
spark = SparkSession.builder.appName('multi-processing'
).master('spark://1.2.3.4:8077').getOrCreate()
for n in spark.sparkContext.getConf().getAll():
print (n)
v = multiprocessing.Value('d', 0.0)
p1 = multiprocessing.Process(target=_readStream_c3, args=(v, ))
p1.start()
p1.join()
StdOut
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/tmp/tmp3syog799.py", line 22, in _readStream_c3
df = (spark.readStream.format('delta').option('readChangeFeed','true').option('startingVersion','latest').option('_change_type', 'insert').load("dbfs:/mnt/path/to/file").writeStream.foreachBatch(lambda df, epochId: _get_c3_value(df, epochId)).outputMode('append').start())
File "/databricks/spark/python/pyspark/sql/dataframe.py", line 272, in writeStream
return DataStreamWriter(self)
File "/databricks/spark/python/pyspark/sql/streaming.py", line 755, in __init__
self._jwrite = df._jdf.writeStream()
File "/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "/databricks/spark/python/pyspark/sql/utils.py", line 117, in deco
return f(*a, **kw)
File "/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py", line 330, in
get_return_value
raise Py4JError(
py4j.protocol.Py4JError: An error occurred while calling o1311.writeStream. Trace:
py4j.Py4JException: Method writeStream([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:341)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:349)
at py4j.Gateway.invoke(Gateway.java:286)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
StdErr
ERROR: Query termination received for [id=0bf6163e-797c-4918-a057-8a90fa1f9d2d, runId=679fb0b5-ec0c-456e-939b-c0ba4c34325f], with exception: py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Object ID unknown
at py4j.Protocol.getReturnValue(Protocol.java:476)
at py4j.reflection.PythonProxyHandler.invoke(PythonProxyHandler.java:108)
at com.sun.proxy.$Proxy102.call(Unknown Source)
at org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchHelper$.$anonfun$callForeachBatch$1(ForeachBatchSink.scala:208)
at org.apache.spark.sql.execution.streaming.sources.PythonForeachBatchHelper$.$anonfun$callForeachBatch$1$adapted(ForeachBatchSink.scala:208)
at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.addBatchLegacy(ForeachBatchSink.scala:89)
at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.$anonfun$addBatch$1(ForeachBatchSink.scala:68)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:680)
at org.apache.spark.sql.execution.streaming.sources.ForeachBatchSink.addBatch(ForeachBatchSink.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:719)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$8(SQLExecution.scala:239)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:386)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:186)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:968)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:141)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:336)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:717)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:301)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:299)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:73)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:717)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$5(MicroBatchExecution.scala:283)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.withSchemaEvolution(MicroBatchExecution.scala:816)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:280)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:301)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:299)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:73)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:239)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:67)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:233)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:359)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:968)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:323)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:250)
What am I doing wrong ?. I can run the streaming part fine in notebook but not as a python script.
Thanks in advance!
I'm new to pyspark. I'm running pyspark in the local machine. I'm trying write to CSV file from pyspark data frame. So I wrote the following code
dataframe.write.mode('append').csv(outputPath)
But I'm getting an error message
Traceback (most recent call last):
File "D:\PycharmProjects\pythonProject\org\spark\weblog\SparkWebLogsAnalysis.py", line 71, in <module>
weblog_sessionIds.write.mode('append').csv(outputPath)
File "C:\spark-3.1.2-bin-hadoop3.2\python\pyspark\sql\readwriter.py", line 1372, in csv
self._jwrite.csv(path)
File "C:\spark-3.1.2-bin-hadoop3.2\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in __call__
File "C:\spark-3.1.2-bin-hadoop3.2\python\pyspark\sql\utils.py", line 111, in deco
return f(*a, **kw)
File "C:\spark-3.1.2-bin-hadoop3.2\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o49.csv.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode0(Ljava/lang/String;I)V
at org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO$Windows.createDirectoryWithMode(NativeIO.java:560)
at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:534)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:587)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:559)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:586)
at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:559)
at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:705)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:354)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:178)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
Can you suggest me to rectify this error?
Problem got resolve by deleting hadoop.dll file from winutils folder and using lower version of Spark
Running a glueetl, GlueVersion 2.0, python 3, AWS Glue job I am trying to do the recommended python3 lazy loading log messages with
logger.info("Attempting to run python module 1 {entrypoint}".format(entrypoint=entrypoint))
logger.info("Attempting to run python module 2 %s", entrypoint)
This produces an error on the second line but the first line succeeds and prints the string.
2021-06-25 04:12:58,782 INFO [Thread-7] log.GlueLogger (GlueLogger.scala:info(8)): Attempting to run python module 1 main.test_program
2021-06-25 04:12:58,818 ERROR [main] glue.ProcessLauncher (Logging.scala:logError(70)): Error from Python:Traceback (most recent call last):
File "/tmp/main_etl_script.py", line 103, in <module>
spark=spark)
File "/tmp/main_etl_script.py", line 32, in main_handler
logger.info("Attempting to run python module 2 %s", entrypoint)
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 332, in get_return_value
format(target_id, ".", name, value))
py4j.protocol.Py4JError: An error occurred while calling o85.info. Trace:
py4j.Py4JException: Method info([class java.lang.String, class java.lang.String]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
My logger setup is simple:
import logging
...
glue_context = GlueContext(SparkContext())
logger = glue_context.get_logger()
Why does this occur?
It doesn't like the comma after the 2 %s":
logger.info("Attempting to run python module 2 %s", entrypoint)
try this with a f string:
logger.info(f"Attempting to run python module 2 {entrypoint}")
I was trying to Connect and Fetch data from BigQuery Dataset to Local Pycharm Using Pyspark.
I ran this below Script in Pycharm:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.config('spark.jars', "C:/Users/PycharmProjects/pythonProject/spark-bigquery-latest.jar")\
.getOrCreate()
conn = spark.read.format("bigquery")\
.option("credentialsFile", "C:/Users/PycharmProjects/pythonProject/google-bq-api.json")\
.option("parentProject", "Google-Project-ID")\
.option("project", "Dataset-Name")\
.option("table", "dataset.schema.tablename")\
.load()
conn.show()
For this I got the below error:
Exception in thread "main" java.io.IOException: No FileSystem for scheme: C
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPath(DependencyUtils.scala:191)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2(DependencyUtils.scala:147)
at org.apache.spark.deploy.DependencyUtils$.$anonfun$resolveGlobPaths$2$adapted(DependencyUtils.scala:145)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:108)
at org.apache.spark.deploy.DependencyUtils$.resolveGlobPaths(DependencyUtils.scala:145)
at org.apache.spark.deploy.SparkSubmit.$anonfun$prepareSubmitEnvironment$4(SparkSubmit.scala:363)
at scala.Option.map(Option.scala:230)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:363)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:871)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:\Users\naveen.chandar\PycharmProjects\pythonProject\BigQueryConnector.py", line 4, in <module>
spark = SparkSession.builder.config('spark.jars', 'C:/Users/naveen.chandar/PycharmProjects/pythonProject/spark-bigquery-latest.jar').getOrCreate()
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\sql\session.py", line 186, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 376, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 133, in __init__
SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\context.py", line 325, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python39\lib\site-packages\pyspark\java_gateway.py", line 105, in launch_gateway
raise Exception("Java gateway process exited before sending its port number")
Exception: Java gateway process exited before sending its port number
So, I researched and tried it from different Diecrtory like "D-drive" and also tried to fix a static port with set PYSPARK_SUBMIT_ARGS="--master spark://<IP_Address>:<Port>", but still I got the same error in Pycharm.
Then I thought of trying the same script in local Command Prompt under Pyspark and I got this error:
failed to find class org/conscrypt/CryptoUpcalls
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1152, in send_command
answer = smart_decode(self.stream.readline()[:-1])
File "C:\Users\naveen.chandar\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 985, in send_command
response = connection.send_command(command)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1164, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\dataframe.py", line 381, in show
print(self._jdf.showString(n, 20, vertical))
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "D:\spark-2.4.7-bin-hadoop2.7\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "D:\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 336, in get_return_value
py4j.protocol.Py4JError: An error occurred while calling o42.showString
My Python Version is 3.7.9 and Spark Version is 2.4.7
So either way I ran out of idea's and I appreciate some help on any one of the situation I facing...
Thanks In Advance!!
Start your file system references with file:///c:/...
You need to replace / with \ for the path to work
I'm trying to stream data from AWS Kinesis using Spark Streaming + Kinesis Integration
My code looks like:
sc = SparkContext('local[*]', 'app_name')
ssc = StreamingContext(sc, 10)
kinesisStream = KinesisUtils.createStream(ssc,
kinesisAppName='kinesis_app_name',
streamName='kinesis_stream_name',
endpointUrl='https://kinesis.ap-southeast-2.amazonaws.com',
regionName='ap-southeast-2',
initialPositionInStream=InitialPositionInStream.TRIM_HORIZON,
checkpointInterval=10)
The command to run the script: spark-submit --packages org.apache.spark:spark-streaming-kinesis-asl_2.11:2.2.0 script.py. I'm using Spark 2.2.0 with Pyspark.
The error I got:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/py4j/java_gateway.py", line 1035, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/py4j/java_gateway.py", line 883, in send_command
response = connection.send_command(command)
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/py4j/java_gateway.py", line 1040, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
File "kinesis_to_s3.py", line 63, in
checkpointInterval=streaming_interval)
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/pyspark/streaming/kinesis.py", line 92, in createStream
stsSessionName, stsExternalId)
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/py4j/java_gateway.py", line 1133, in call
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/transformer/env/lib/python3.5/site-packages/py4j/protocol.py", line 327, in get_return_value
format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o27.createStream
Exception in thread "Thread-2" java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/clientlibrary/lib/worker/InitialPositionInStream
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.privateGetPublicMethods(Class.java:2902)
at java.lang.Class.getMethods(Class.java:1615)
at py4j.reflection.ReflectionEngine.getMethodsByNameAndLength(ReflectionEngine.java:345)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:305)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 12 more