Unable to create pyspark DataFrame from Datastore in azureml-sdk (version 1.12.0) - azure-machine-learning-service

I am trying to read contents from a CSV file into Spark DataFrame using azureml-sdk using following code but an exception is being thrown.
Code throwing exception
import pyspark.sql as spark
from azureml.core import Dataset
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, file_path)], header = False)
sdf: spark.DataFrame = dataset.to_spark_dataframe()
sdf.show()
Exception
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/data/dataset_error_handling.py in _try_execute(action, operation, dataset_info, **kwargs)
100 else:
--> 101 return action()
102 except Exception as e:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/_loggerfactory.py in wrapper(*args, **kwargs)
178 try:
--> 179 return func(*args, **kwargs)
180 except Exception as e:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py in to_spark_dataframe(self)
763 self._raise_if_missing_secrets()
--> 764 return self._spark_executor.get_dataframe(steps_to_block_datas(self._steps), use_sampling=False)
765
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/sparkexecution.py in get_dataframe(self, steps, use_sampling, overrides, use_first_record_schema)
136 overrides,
--> 137 use_first_record_schema)
138
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/sparkexecution.py in _execute(self, blocks, export_format, use_sampling, overrides, use_first_record_schema)
169 + lariat_version + '.')
--> 170 raise e
171
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/sparkexecution.py in _execute(self, blocks, export_format, use_sampling, overrides, use_first_record_schema)
160 if export_format == ExportScriptFormat.PYSPARKDATAFRAMELOADER:
--> 161 return module.LoadData(secrets=secrets, schemaFromFirstRecord=use_first_record_schema)
162 else:
/tmp/spark-6ce53791-c8e4-4db0-bd37-bedb53a1ef1e/userFiles-dda6cd30-5d1e-48cf-af87-9c7c2a4b8038/loaderb9bc01c2b40c4b7aa86a95d343021e0c.py in LoadData(secrets, schemaFromFirstRecord)
8 def LoadData(secrets=dict(), schemaFromFirstRecord=False):
----> 9 pex = Executor("S4ddf53ee8d5f4173bd3dcf4b51d78247", "dprep_2.11", "0.116.0", "42315", "39a925e4-9ae9-4588-93c4-5433250b7f73")
10 jex = pex.jex
/tmp/spark-6ce53791-c8e4-4db0-bd37-bedb53a1ef1e/userFiles-dda6cd30-5d1e-48cf-af87-9c7c2a4b8038/Executor.py in __init__(self, scalaName, dprepMavenPackageName, dprepMavenPackageMatchingVersion, pythonHostChannelPort, pythonHostSecret)
54 pythonHostChannelPort,
---> 55 pythonHostSecret)
56
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/py4j/java_gateway.py in __call__(self, *args)
1568 return_value = get_return_value(
-> 1569 answer, self._gateway_client, None, self._fqn)
1570
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
Py4JJavaError: An error occurred while calling None.com.microsoft.dprep.execution.PySparkExecutor.
: java.lang.NoClassDefFoundError: Could not initialize class com.microsoft.dprep.integration.azureml.AmlPySdkInvoker$
at com.microsoft.dprep.execution.PySparkExecutor.<init>(PySparkExecutor.scala:79)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
AzureMLException Traceback (most recent call last)
<ipython-input-30-c546b1aded42> in <module>
2 from azureml.core import Dataset
3 dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, file_path)], header = False)
----> 4 sdf: spark.DataFrame = dataset.to_spark_dataframe()
5 sdf.show()
6
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/data/_loggerfactory.py in wrapper(*args, **kwargs)
124 with _LoggerFactory.track_activity(logger, func.__name__, activity_type, custom_dimensions) as al:
125 try:
--> 126 return func(*args, **kwargs)
127 except Exception as e:
128 if hasattr(al, 'activity_info') and hasattr(e, 'error_code'):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/data/tabular_dataset.py in to_spark_dataframe(self)
187 return _try_execute(dataflow.to_spark_dataframe,
188 'to_spark_dataframe',
--> 189 None if self.id is None else {'id': self.id, 'name': self.name, 'version': self.version})
190
191 #track(_get_logger, custom_dimensions={'app_name': 'TabularDataset'}, activity_type=_PUBLIC_API)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/data/dataset_error_handling.py in _try_execute(action, operation, dataset_info, **kwargs)
102 except Exception as e:
103 message, is_dprep_exception = _construct_message_and_check_exception_type(e, dataset_info, operation)
--> 104 _dataprep_error_handler(e, message, is_dprep_exception)
105
106
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/data/dataset_error_handling.py in _dataprep_error_handler(e, message, is_dprep_exception)
143 raise AzureMLException(message, inner_exception=e)
144 else:
--> 145 raise AzureMLException(message, inner_exception=e)
146
147
AzureMLException: AzureMLException:
Message: Execution failed unexpectedly due to: Py4JJavaError
InnerException An error occurred while calling None.com.microsoft.dprep.execution.PySparkExecutor.
: java.lang.NoClassDefFoundError: Could not initialize class com.microsoft.dprep.integration.azureml.AmlPySdkInvoker$
at com.microsoft.dprep.execution.PySparkExecutor.<init>(PySparkExecutor.scala:79)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
ErrorResponse
{
"error": {
"message": "Execution failed unexpectedly due to: Py4JJavaError"
}
}
However, I can read and print the data with the following code i.e. create as a Panda's DataFrame.
Working code
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, file_path)], header = False)
#sdf: spark.DataFrame = dataset.to_spark_dataframe()
sdf: pd.DataFrame = dataset.to_pandas_dataframe()
print(sdf.head(3))

Dataset doesn't support Scala 2.12 runtime at the moment. The team is working on it and will address the feature gap soon. Stay tuned!

Related

Issue adding Word2Vec to Spark pipeline

I'm still getting used to Spark but I am having an issue figuring out how to build a pipeline. I have a spark dataframe below and my end goal is to classify each movie by reviewing their plot and classifying them.
dataframe
I am trying to create a pipeline using a stringIndexer, tokenizer, stopwordsremover and word2vec but I am getting the below error. I'm not sure how to resolve it after looking at some similar topics.
indexer = StringIndexer(inputCol="word", outputCol="label")
tokenizer = Tokenizer(inputCol = "plot_synopsis", outputCol = "tokenized_terms")
remover = StopWordsRemover(inputCol="tokenized_terms", outputCol="filtered")
word2Vec = Word2Vec(vectorSize=5, minCount=0, inputCol="filtered", outputCol="wordVectors")
pipeline = Pipeline(stages=[tokenizer, remover, word2Vec, indexer])
encodedData = pipeline.fit(df_expand).transform(df_expand)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-25-7d237f91c3cf> in <module>
----> 1 encodedData = pipeline.fit(df_expand).transform(df_expand)
~\anaconda3\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise TypeError("Params must be either a param map or a list/tuple of param maps, "
~\anaconda3\lib\site-packages\pyspark\ml\pipeline.py in _fit(self, dataset)
112 dataset = stage.transform(dataset)
113 else: # must be an Estimator
--> 114 model = stage.fit(dataset)
115 transformers.append(model)
116 if i < indexOfLastEstimator:
~\anaconda3\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise TypeError("Params must be either a param map or a list/tuple of param maps, "
~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
~\anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
~\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
~\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
~\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o147.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 18.0 failed 1 times, most recent failure: Lost task 0.0 in stage 18.0 (TID 14) (host.docker.internal executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
at org.apache.spark.mllib.feature.Word2Vec.learnVocab(Word2Vec.scala:191)
at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:312)
at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:182)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Unknown Source)
can you check once and make sure you are passing Datafrmae to fit() method.
Related issue:
PySpark :: FP-growth algorithm ( raise ValueError("Params must be either a param map or a list/tuple of param maps, ")

I get an error while reading multiple multiline JSON files in Databricks

I get an Py4JJavaError message when I'm reading multiple multiline JSON files from a folder. It seems like Spark is struggling while infering a schema from these files.
I tried to reduce the number of files to read since it has to infer from thousands of JSON files but it doesn't seem to work.
def get_user_details_schema(url):
df = sqlContext.read.json(url, multiLine=True)
return df.schema
This is the message I get :
Py4JJavaError Traceback (most recent call last)
<command-2296498238051133> in <module>()
19
20
---> 21 main()
<command-2296498238051133> in main()
15
16
---> 17 process_users(config.user_input_url, config.user_output_url)
18
19
<command-2296498238051133> in process_users(input_url, output_url)
1 def process_users(input_url, output_url):
----> 2 user_df = get_cleansed_users(input_url)
3
4 if not user_df or user_df.rdd.isEmpty():
5 print("User input dataset does not exists or is empty. Nothing to do.")
<command-2296498238051132> in get_cleansed_users(input_url)
16
17 def get_cleansed_users(input_url):
---> 18 df = read_if_exists(input_url, get_user_details_schema(input_url))
19
20 formater_date = udf(format_date)
<command-2296498238051132> in get_user_details_schema(url)
1 def get_user_details_schema(url):
----> 2 df = sqlContext.read.json(url, multiLine=True)
3
4 return df.schema
5
/databricks/spark/python/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, lineSep, samplingRatio, dropFieldIfAllNull, encoding)
272 path = [path]
273 if type(path) == list:
--> 274 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
275 elif isinstance(path, RDD):
276 def func(iterator):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o389.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 10.10.25.4, executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2355)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2343)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2342)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2342)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1096)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1096)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2574)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2510)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:893)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2243)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2341)
at org.apache.spark.sql.catalyst.json.JsonInferSchema$.infer(JsonInferSchema.scala:83)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$$anonfun$infer$1.apply(JsonDataSource.scala:172)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$$anonfun$infer$1.apply(JsonDataSource.scala:172)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:240)
at org.apache.spark.sql.execution.datasources.json.MultiLineJsonDataSource$.infer(JsonDataSource.scala:171)
at org.apache.spark.sql.execution.datasources.json.JsonDataSource.inferSchema(JsonDataSource.scala:65)
at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.inferSchema(JsonFileFormat.scala:59)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:204)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:195)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:195)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:412)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:298)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:284)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:467)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
If your cluster is running Databricks Runtime 4.0 and above, you can read JSON files in single-line or multi-line mode. In single-line mode, a file can be split into many parts and read in parallel.
Read JSON files in single-line:
val testJsonData = sqlContext.read.json("/tmp/test.json")
display(testJsonData)
Read JSON files in multi-line:
val testJsonData = sqlContext.read.option("multiline","true").json("/tmp/test.json")
display(testJsonData)
Reference: Azure Databricks - JSON Files
Hope this helps.

Databricks notebook time out error when calling other notebooks: com.databricks.WorkflowException: java.net.SocketTimeoutException: Read timed out

I have a main notebook that call a series of other notebook. Each notebook performs a MERGE on a delta table to update or insert new records on it.
When I ran the main notebook with a job cluster, one notebook, Medications, failed with a timeout error . When I ran the Medication notebook with an interactive cluster, it passed.
The job and the interactive cluster have the same setup as shown below:
What could be the problem? The standard error from the spark driver logs is shown below:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3958057957970596> in <module>()
1 #Run CDMMedications
----> 2 dbutils.notebook.run("CDMMedications", 0, {"TheScope":TheScope, "TheKey":TheKey, "StorageAccount":StorageAccount, "FileSystem":FileSystem, "Database":Database})
/local_disk0/tmp/1565905071244-0/dbutils.py in run(self, path, timeout_seconds, arguments, _NotebookHandler__databricks_internal_cluster_spec)
134 arguments,
135 __databricks_internal_cluster_spec,
--> 136 self.shell.currentJobGroup)
137
138 def __repr__(self):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o779._run.
: com.databricks.WorkflowException: java.net.SocketTimeoutException: Read timed out
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:75)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:90)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl._run(NotebookUtilsImpl.scala:69)
at sun.reflect.GeneratedMethodAccessor605.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
at sun.security.ssl.InputRecord.read(InputRecord.java:503)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:975)
at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:933)
at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:282)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259)
at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163)
at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:165)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125)
at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272)
at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111)
at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:72)
at com.databricks.common.client.RawDBHttpClient.httpRequestInternal(DBHttpClient.scala:498)
at com.databricks.common.client.RawDBHttpClient.entityEnclosingRequestInternal(DBHttpClient.scala:489)
at com.databricks.common.client.RawDBHttpClient.postInternal(DBHttpClient.scala:420)
at com.databricks.common.client.RawDBHttpClient.postJson(DBHttpClient.scala:283)
at com.databricks.common.client.DBHttpClient.postJson(DBHttpClient.scala:200)
at com.databricks.workflow.SimpleJobsSessionClient.createNotebookJob(JobsSessionClient.scala:160)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply$mcJ$sp(JobsSessionClient.scala:249)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply(JobsSessionClient.scala:249)
at com.databricks.workflow.ReliableJobsSessionClient$$anonfun$createNotebookJob$1.apply(JobsSessionClient.scala:249)
at com.databricks.common.client.DBHttpClient$.retryWithDeadline(DBHttpClient.scala:133)
at com.databricks.workflow.ReliableJobsSessionClient.withRetry(JobsSessionClient.scala:313)
at com.databricks.workflow.ReliableJobsSessionClient.createNotebookJob(JobsSessionClient.scala:248)
at com.databricks.workflow.WorkflowDriver.run0(WorkflowDriver.scala:93)
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:61)
... 12 more
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3615515772639167> in <module>()
1 #Run CDMLoad
----> 2 dbutils.notebook.run("CDMLoads/CDMLoad",0,{"TheScope":TheScope,"TheKey":TheKey,"StorageAccount":StorageAccount, "FileSystem":FileSystem, "Database":Database})
/local_disk0/tmp/1565905071244-0/dbutils.py in run(self, path, timeout_seconds, arguments, _NotebookHandler__databricks_internal_cluster_spec)
134 arguments,
135 __databricks_internal_cluster_spec,
--> 136 self.shell.currentJobGroup)
137
138 def __repr__(self):
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o866._run.
: com.databricks.WorkflowException: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:75)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl.run(NotebookUtilsImpl.scala:90)
at com.databricks.dbutils_v1.impl.NotebookUtilsImpl._run(NotebookUtilsImpl.scala:69)
at sun.reflect.GeneratedMethodAccessor605.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.databricks.NotebookExecutionException: FAILED
at com.databricks.workflow.WorkflowDriver.run0(WorkflowDriver.scala:118)
at com.databricks.workflow.WorkflowDriver.run(WorkflowDriver.scala:61)
... 12 more
The second parameter in your call to dbutils.notebook.run() is the seconds allowed before timing out. Looking at your error, it appears you have set it to 0.
dbutils.notebook.run("CDMMedications", 0, {"TheScope":TheScope,
"TheKey":TheKey, "StorageAccount":StorageAccount,
"FileSystem":FileSystem, "Database":Database})
Furthermore, the error also states Caused by: java.net.SocketTimeoutException: Read timed out.
From the docs for dbutils.notebook:
run(path: String, timeoutSeconds: int, arguments: Map): String -> This method runs a notebook and returns its exit value.
Try setting your timeoutSeconds to something like 300-600 and see how it goes. You might need to set it for as long as your longest job/notebook runs.
I fixed the problem by tuning the default spark configuration. I increase the executor heartbeat and the networko
spark.executor.heartbeat 60s
spark.network.timeout 720s

Job cancelled because SparkContext was shut down

While running my spark program in jupyter notebook I got the error "Job cancelled because SparkContext was shut down".I am using spark without hadoop.The same program gave output earlier but now showing error.Any idea why would the error must have occured.
My code is :
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df = sqlContext.read.json("Musical_Instruments_5.json")
pd=df.select(df['asin'],df['overall'],df['reviewerID'])
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for
column in list(set(pd.columns)-set(['overall'])) ]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(pd).transform(pd)
transformed.show()
(training,test)=transformed.randomSplit([0.8, 0.2])
als=ALS(maxIter=30,regParam=0.09,rank=25,userCol="reviewerID_index",itemCol="asin_index",ratingCol="overall",coldStartStrategy="drop",nonnegative=True)
model=als.fit(training)
This is the point where it gives error.
Py4JJavaError Traceback (most recent call last)
<ipython-input-14-2e31692d867d> in <module>()
1 #Fit ALS model to training data
----> 2 model=als.fit(training)
C:\spark\spark-2.3.1-bin-hadoop2.7\python\pyspark\ml\base.py in fit(self, dataset, params)
130 return self.copy(params)._fit(dataset)
131 else:
--> 132 return self._fit(dataset)
133 else:
134 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\spark\spark-2.3.1-bin-hadoop2.7\python\pyspark\ml\wrapper.py in _fit(self, dataset)
286
287 def _fit(self, dataset):
--> 288 java_model = self._fit_java(dataset)
289 model = self._create_model(java_model)
290 return self._copyValues(model)
C:\spark\spark-2.3.1-bin-hadoop2.7\python\pyspark\ml\wrapper.py in _fit_java(self, dataset)
283 """
284 self._transfer_params_to_java()
--> 285 return self._java_obj.fit(dataset._jdf)
286
287 def _fit(self, dataset):
C:\spark\spark-2.3.1-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
C:\spark\spark-2.3.1-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
C:\spark\spark-2.3.1-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o132.fit.
: org.apache.spark.SparkException: Job 11 cancelled because SparkContext was shut down
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:837)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:835)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:835)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1841)
at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1754)
at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1931)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1360)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1930)
at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:573)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1991)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.rdd.RDD.count(RDD.scala:1162)
at org.apache.spark.ml.recommendation.ALS$.train(ALS.scala:1030)
at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:674)
at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:568)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
This problem is solved now.I have to create a checkpoint directory as number of iterations was more than 20 for training.
The code for creating checkpoint directory is:
SparkContext.setCheckpointDir("path to directory")

Error while running PageRank and BFS functions on Graphframes in PySpark

I'm new to Spark, and am learning it on the Cloudera Distr for Hadoop (CDH). I'm trying to execute the PageRank and BFS functions through Jupyter Notebook, which was initiated using the following command:
pyspark --packages graphframes:graphframes:0.1.0-spark1.6,com.databricks:spark-csv_2.11:1.2.0
The below is the PageRank function command I tried to run, along with the error message:
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-20-34d549cc033e> in <module>()
----> 1 ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
2 ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20).show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in pageRank(self, resetProbability, sourceId, maxIter, tol)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o106.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.apache.spark.graphx.lib.backport.PageRank$.log(PageRank.scala:65)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.apache.spark.graphx.lib.backport.PageRank$.logInfo(PageRank.scala:65)
at org.apache.spark.graphx.lib.backport.PageRank$.runWithOptions(PageRank.scala:148)
at org.graphframes.lib.PageRank$.run(PageRank.scala:130)
at org.graphframes.lib.PageRank.run(PageRank.scala:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
I'm getting the same error messages for the BFS function I'm trying:
filteredPaths = tripGraph.bfs(
fromExpr = "id = 'SEA'",
toExpr = "id = 'SFO'",
maxPathLength = 1)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-22-74394b11f50d> in <module>()
4 fromExpr = "id = 'SEA'",
5 toExpr = "id = 'SFO'",
----> 6 maxPathLength = 1)
7
8 filteredPaths.show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in bfs(self, fromExpr, toExpr, edgeFilter, maxPathLength)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o147.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.graphframes.lib.BFS$.log(BFS.scala:131)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.graphframes.lib.BFS$.logInfo(BFS.scala:131)
at org.graphframes.lib.BFS$.org$graphframes$lib$BFS$$run(BFS.scala:212)
at org.graphframes.lib.BFS.run(BFS.scala:126)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Can you please let me know the issue?
Thanks, Sasi.
You are using incompatible Scala versions:
graphframes:graphframes:0.1.0-spark1.6 - Scala 2.10
com.databricks:spark-csv_2.11:1.2.0 - Scala 2.11
Spark installation - Probably Scala 2.10.
You have to use the same Scala version for all components (com.databricks:spark-csv_2.10:1.2.0 if Spark is compiled with Scala 2.10). Please consult Resolving dependency problems in Apache Spark for details.

Resources