PySpark using s3a throws java.lang.IllegalArgumentException - apache-spark

I have a bunch of code that works perfectly with s3n, but when I try to switch to s3a I just get a java.lang.IllegalArgumentException without any real pointer or hint as to what exactly is wrong. I would appreciate some suggestions for debugging! I'm on hadoop-aws-2.7.3 and aws-java-sdk-1.7.4, so I believe the library versions should be fine.
error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-2-1aafd157ea37> in <module>
----> 1 schema_df = spark.read.json('s3a://udemy-stream-logs/cdn-access-raw/verizon/mp4-a.udemycdn.com/wpc_C9216_306_20200701_0C390000BFD7B55E_100.json_lines.gz')
2 schema = schema_df.schema
/usr/local/spark/python/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup)
298 path = [path]
299 if type(path) == list:
--> 300 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
301 elif isinstance(path, RDD):
302 def func(iterator):
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o31.json.
: java.lang.IllegalArgumentException
at java.base/java.util.concurrent.ThreadPoolExecutor.<init>(ThreadPoolExecutor.java:1293)
at java.base/java.util.concurrent.ThreadPoolExecutor.<init>(ThreadPoolExecutor.java:1215)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:280)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:477)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
my code:
conf = (SparkConf()
.set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
.set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
.set('spark.master', 'local[*]')
.set('spark.driver.memory', '4g'))
scT = SparkContext(conf=conf)
scT.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
scT.setLogLevel("INFO")
hadoopConf = scT._jsc.hadoopConfiguration()
hadoopConf.set('fs.s3.buffer.dir', '/tmp/pyspark')
hadoopConf.set('fs.s3a.awsAccessKeyId', 'key')
hadoopConf.set('fs.s3a.awsSecretAccessKey', 'secret')
hadoopConf.set('fs.s3a.endpoint', 's3-us-east-1.amazonaws.com')
hadoopConf.set('fs.s3a.multipart.size', '104857600')
hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoopConf.set('fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider')
spark = SparkSession(scT)
df = spark.read.json('s3a://mybucket/something_something.json_lines.gz')

Ignoring the little detail that you have set the username and password in the wrong properties for the s3a connector (it expects fs.s3a.access.key and fs.s3a.secret.key), that stack trace implies the exception comes from thread pool construction. Presumably one of the parameters passed in (thread pool size, keep-alive time) is somehow invalid. The JVM gives no obvious clue as to which specific option it is, though.
My recommendation is to stop copying and pasting other Stack Overflow examples and look at the s3a documentation. See what the options are for authentication and for bounded and unbounded thread pools, and make sure they're set correctly.

I had the same problem as you, and I figured out that it was caused by the line that configures "fs.s3a.multipart.size". I removed it and the problem went away. You could try that.

Related

How to set azure jars for pyspark to be able to read Blob Storage from jupyter?

I am building an application for study purposes. In this application, I have two Docker containers mounted:
azurite (which emulates an Azure Storage container) - mcr.microsoft.com/azure-storage/azurite
a jupyter notebook with pyspark - jupyter/pyspark-notebook
They are already on the same network, and the communication between them is not a problem.
My main problem is that I am trying to make pyspark read files from Azure Storage with spark.read.json(...), but I can't because I don't understand how to configure the pyspark jar files.
Below is my attempt:
spark = SparkSession.builder \
.appName('test') \
.config(
'spark.driver.extraClassPath',
'/home/jovyan/work/normalization/.jars/hadoop-azure-3.3.2.jar, /home/jovyan/work/normalization/.jars/azure-storage-8.6.6.jar') \
.config(
'fs.azure',
'org.apache.hadoop.fs.azure.NativeAzureFileSystem') \
.config(
'fs.azure.account.key.devstoreaccount1.blob.core.windows.net',
'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=='
) \
.getOrCreate()
df = spark.read.json('wasbs://container#devstoreaccount1.blob.core.windows.net/path/to/file.json')
When I try to read the file, I get the following error:
Py4JJavaError Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 df = spark.read.json('wasbs://bronze#devstoreaccount1.blob.core.windows.net/pokemon_tcg/cards/2022/05/01/*.json')
File /usr/local/spark/python/pyspark/sql/readwriter.py:229, in DataFrameReader.json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding, locale, pathGlobFilter, recursiveFileLookup, allowNonNumericNumbers, modifiedBefore, modifiedAfter)
227 path = [path]
228 if type(path) == list:
--> 229 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
230 elif isinstance(path, RDD):
231 def func(iterator):
File /usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File /usr/local/spark/python/pyspark/sql/utils.py:111, in capture_sql_exception.<locals>.deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
File /usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o40.json.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2667)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$1(DataSource.scala:747)
at scala.collection.immutable.List.map(List.scala:293)
at org.apache.spark.sql.execution.datasources.DataSource$.checkAndGlobPathIfNecessary(DataSource.scala:745)
at org.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessary(DataSource.scala:577)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:408)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:405)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2571)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2665)
... 29 more
What am I doing wrong???

Py4JJavaError: An error occurred while calling o57.sql.: org.apache.spark.SparkException: Job aborted

I'm trying to write a Spark dataframe to a Hive table using the code below, but I get an error.
I've checked posts about similar issues (Py4JJavaError: An error occurred while calling o57.showString. : org.apache.spark.SparkException:) but I couldn't find a solution.
The full error is shown below.
CODE:
spark_df = spark.createDataFrame(df2)
spark_df.createOrReplaceTempView("steer");
spark.sql("drop table if exists sandbox_nonmotor.steer")
spark.sql("create table sandbox_nonmotor.steer as select * from steer")
ERROR :
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-16-84bf8c9c8f45> in <module>
2 spark_df.createOrReplaceTempView("steer");
3 spark.sql("drop table if exists sandbox_nonmotor.steer")
----> 4 spark.sql("create table sandbox_nonmotor.steer as select * from steer")
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/sql/session.py in sql(self, sqlQuery)
765 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
766 """
--> 767 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
768
769 #since(2.0)
/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in
__call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/opt/cloudera/parcels/SPARK2/lib/spark2/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/opt/cloudera/parcels/SPARK2/lib/spark2/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o57.sql.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile$class.saveAsHiveFile(SaveAsHiveFile.scala:86)
at
org.apache.spark.sql.hive.execution.InsertIntoHiveTable.saveAsHiveFile(InsertIntoHiveTable.scala:66)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.processInsert
(InsertIntoHiveTable.scala:195)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.run(InsertIntoHiveTable.scala:99)
at org.apache.spark.sql.hive.execution.CreateHiveTableAsSelectCommand.run
(CreateHiveTableAsSelectCommand.scala:88)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute
(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:115)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:194)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:194)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply
(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:194)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:79)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Serialized task 2:0 was
155731289 bytes, which exceeds max allowed: spark.rpc.message.maxSize (134217728 bytes). Consider
increasing spark.rpc.message.maxSize or using broadcast variables for large values.
at org.apache.spark.scheduler.DAGScheduler.
org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply
(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply
(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
... 29 more
The post you've linked describes a different issue; in your case the error message is:
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Serialized task 2:0 was
155731289 bytes, which exceeds max allowed: spark.rpc.message.maxSize (134217728 bytes). Consider
increasing spark.rpc.message.maxSize or using broadcast variables for large values.
You should try setting a bigger spark.rpc.message.maxSize (the value is given in MiB; the 134217728-byte limit in your error corresponds to 128), for example:
config = SparkConf().set('spark.rpc.message.maxSize', '256')
sc = SparkContext.getOrCreate(conf=config)

Microsoft Azure spark kusto connector -- Is it possible to get files of azure storage out of databricks?

I'm trying to read and write files in Azure Storage. My attempts so far:
Creating the Spark Session:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
sparkOptions = {"executor_memory" : "1G","driver_memory": "1G", "max_results_size": "1G"}
conf = pyspark.SparkConf().setAppName(app)
conf = (conf.setMaster("local[*]")
.set('spark.executor.memory', sparkOptions["executor_memory"])\
.set('spark.driver.memory', sparkOptions["driver_memory"])\
.set('spark.driver.maxResultSize', sparkOptions["max_results_size"])\
.set('spark.sql.crossJoin.enabled', "true")\
.set('spark.jars.packages', 'com.microsoft.azure.kusto:spark-kusto-connector:1.0.0')\
.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")\
.set("fs.azure.account.auth.type", "OAuth")\
.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")\
.set("fs.azure.account.oauth2.client.id", id)\
.set("fs.azure.account.oauth2.client.secret", secret)\
.set("fs.azure.account.oauth2.client.endpoint", endpoint)\
.set("fs.azure.createRemoteFileSystemDuringInitialization", "true")
)
sparkContext = pyspark.SparkContext(conf=conf)
sparkSession = SparkSession(sparkContext)
sqlContext = SQLContext(sparkContext)
Trying to read a CSV in azure storage:
df = sparkSession.read.option("header", "true").csv("wasbs://container#account.blob.core.windows.net/archive.csv")
df.show()
Error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-3-975f978e0f66> in <module>()
----> 1 df = sparkSession.read.option("header", "true").csv("wasbs://container#account.blob.core.windows.net/archive.csv")
2 df.show()
~/anaconda3/lib/python3.6/site-packages/pyspark/sql/readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue)
474 path = [path]
475 if type(path) == list:
--> 476 return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
477 elif isinstance(path, RDD):
478 def func(iterator):
~/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/anaconda3/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o68.csv.
: java.io.IOException: No FileSystem for scheme: wasbs
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:618)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
Trying with abfss:
df = sparkSession.read.option("header", "true").csv("abfss://container#account.blob.core.windows.net/archive.csv")
df.show()
Error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-02abec06890e> in <module>()
----> 1 df = sparkSession.read.option("header", "true").csv("abfss://container#account.blob.core.windows.net/archive.csv")
2 df.show()
~/anaconda3/lib/python3.6/site-packages/pyspark/sql/readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode, columnNameOfCorruptRecord, multiLine, charToEscapeQuoteEscaping, samplingRatio, enforceSchema, emptyValue)
474 path = [path]
475 if type(path) == list:
--> 476 return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
477 elif isinstance(path, RDD):
478 def func(iterator):
~/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~/anaconda3/lib/python3.6/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
~/anaconda3/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o104.csv.
: java.io.IOException: No FileSystem for scheme: abfss
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:618)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:834)
Searching for examples of how to use the kusto-spark connector, I've only found examples in Databricks that rely on dbutils. I want to know whether it's possible to use the connector outside of Databricks, and what I'm doing wrong in my code. Thank you.
It's not really about Kusto.
Are you using Azure Databricks? If so, simply refer to their docs.
If not, try importing
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-azure</artifactId>
<version>2.7.0</version>
</dependency>
If that doesn't help, download the connector code from GitHub and change this dependency
to 2.7 (the connector uses 3.2).
By the way, I don't know whether you did this or not, but you have to set the key or SAS for this container via the Spark conf.

Databricks notebook time out error when calling other notebooks: com.databricks.WorkflowException: java.net.SocketTimeoutException: Read timed out

I have a main notebook that calls a series of other notebooks. Each notebook performs a MERGE on a Delta table to update or insert new records in it.
When I ran the main notebook with a job cluster, one notebook, Medications, failed with a timeout error. When I ran the Medications notebook with an interactive cluster, it passed.
The job and the interactive cluster have the same setup as shown below:
What could be the problem? The standard error from the spark driver logs is shown below:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3958057957970596> in <module>()
1 #Run CDMMedications
----> 2 dbutils.notebook.run("CDMMedications", 0, {"TheScope":TheScope, "TheKey":TheKey, "StorageAccount":StorageAccount, "FileSystem":FileSystem, "Database":Database})
/local_disk0/tmp/1565905071244-0/dbutils.py in run(self, path, timeout_seconds, arguments, _NotebookHandler__databricks_internal_cluster_spec)
134 arguments,
135 __databricks_internal_cluster_spec,
--> 136 self.shell.currentJobGroup)
137
138 def __repr__(self):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o779._run.
: com.databricks.WorkflowException: java.net.SocketTimeoutException: Read timed out
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:75)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:90)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl._run(NotebookUtilsImpl.scala:69)
at sun.reflect.GeneratedMethodAccessor605.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
at sun.security.ssl.InputRecord.read(InputRecord.java:503)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:975)
at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:933)
at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:282)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259)
at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163)
at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:165)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125)
at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272)
at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111)
at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:72)
at com.databricks.common.client.RawDBHttpClient.httpRequestInternal(DBHttpClient.scala:498)
at com.databricks.common.client.RawDBHttpClient.entityEnclosingRequestInternal(DBHttpClient.scala:489)
at com.databricks.common.client.RawDBHttpClient.postInternal(DBHttpClient.scala:420)
at com.databricks.common.client.RawDBHttpClient.postJson(DBHttpClient.scala:283)
at com.databricks.common.client.DBHttpClient.postJson(DBHttpClient.scala:200)
at com.databricks.workflow.SimpleJobsSessionClient.createNotebookJob(JobsSessionClient.scala:160)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply$mcJ$sp(JobsSessionClient.scala:249)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply(JobsSessionClient.scala:249)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply(JobsSessionClient.scala:249)
at com.databricks.common.client.DBHttpClient$.retryWithDeadline(DBHttpClient.scala:133)
at com.databricks.workflow.ReliableJobsSessionClient.withRetry(JobsSessionClient.scala:313)
at com.databricks.workflow.ReliableJobsSessionClient.createNotebookJob(JobsSessionClient.scala:248)
at com.databricks.workflow.WorkflowDriver.run0(WorkflowDriver.scala:93)
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:61)
... 12 more
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3615515772639167> in <module>()
1 #Run CDMLoad
----> 2 dbutils.notebook.run("CDMLoads/CDMLoad",0,{"TheScope":TheScope,"TheKey":TheKey,"StorageAccount":StorageAccount, "FileSystem":FileSystem, "Database":Database})
/local_disk0/tmp/1565905071244-0/dbutils.py in run(self, path, timeout_seconds, arguments, _NotebookHandler__databricks_internal_cluster_spec)
134 arguments,
135 __databricks_internal_cluster_spec,
--> 136 self.shell.currentJobGroup)
137
138 def __repr__(self):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o866._run.
: com.databricks.WorkflowException: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:75)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:90)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl._run(NotebookUtilsImpl.scala:69)
at sun.reflect.GeneratedMethodAccessor605.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run0(WorkflowDriver.scala:118)
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:61)
... 12 more
The second parameter in your call to dbutils.notebook.run() is the seconds allowed before timing out. Looking at your error, it appears you have set it to 0.
dbutils.notebook.run("CDMMedications", 0, {"TheScope":TheScope,
"TheKey":TheKey, "StorageAccount":StorageAccount,
"FileSystem":FileSystem, "Database":Database})
Furthermore, the error also states Caused by: java.net.SocketTimeoutException: Read timed out.
From the docs for dbutils.notebook:
run(path: String, timeoutSeconds: int, arguments: Map): String -> This method runs a notebook and returns its exit value.
Try setting your timeoutSeconds to something like 300-600 and see how it goes. You might need to set it for as long as your longest job/notebook runs.
I fixed the problem by tuning the default Spark configuration. I increased the executor heartbeat interval and the network timeout:
spark.executor.heartbeatInterval 60s
spark.network.timeout 720s

Error while running PageRank and BFS functions on Graphframes in PySpark

I'm new to Spark, and am learning it on the Cloudera Distribution for Hadoop (CDH). I'm trying to execute the PageRank and BFS functions through Jupyter Notebook, which was launched using the following command:
pyspark --packages graphframes:graphframes:0.1.0-spark1.6,com.databricks:spark-csv_2.11:1.2.0
Below is the PageRank call I tried to run, along with the error message:
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-20-34d549cc033e> in <module>()
----> 1 ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
2 ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20).show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in pageRank(self, resetProbability, sourceId, maxIter, tol)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o106.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.apache.spark.graphx.lib.backport.PageRank$.log(PageRank.scala:65)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.apache.spark.graphx.lib.backport.PageRank$.logInfo(PageRank.scala:65)
at org.apache.spark.graphx.lib.backport.PageRank$.runWithOptions(PageRank.scala:148)
at org.graphframes.lib.PageRank$.run(PageRank.scala:130)
at org.graphframes.lib.PageRank.run(PageRank.scala:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
I'm getting the same error messages for the BFS function I'm trying:
filteredPaths = tripGraph.bfs(
fromExpr = "id = 'SEA'",
toExpr = "id = 'SFO'",
maxPathLength = 1)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-22-74394b11f50d> in <module>()
4 fromExpr = "id = 'SEA'",
5 toExpr = "id = 'SFO'",
----> 6 maxPathLength = 1)
7
8 filteredPaths.show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in bfs(self, fromExpr, toExpr, edgeFilter, maxPathLength)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o147.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.graphframes.lib.BFS$.log(BFS.scala:131)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.graphframes.lib.BFS$.logInfo(BFS.scala:131)
at org.graphframes.lib.BFS$.org$graphframes$lib$BFS$$run(BFS.scala:212)
at org.graphframes.lib.BFS.run(BFS.scala:126)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Can you please let me know the issue?
Thanks, Sasi.
You are using incompatible Scala versions:
graphframes:graphframes:0.1.0-spark1.6 - Scala 2.10
com.databricks:spark-csv_2.11:1.2.0 - Scala 2.11
Spark installation - Probably Scala 2.10.
You have to use the same Scala version for all components (com.databricks:spark-csv_2.10:1.2.0 if Spark is compiled with Scala 2.10). Please consult "Resolving dependency problems in Apache Spark" for details.

Resources