MapR Streams and PySpark

Is PySpark compatible with MapR Streams? Is there any example code?
I've tried the following, but I keep getting an exception:
strLoc = '/Path1:Stream1'
protocol = 'file://' if (strLoc.startswith('/') or strLoc.startswith('\\')) else ''
from pyspark.streaming.kafka import KafkaUtils
from pyspark import StorageLevel
APA = KafkaUtils.createDirectStream(ssc, [strLoc], kafkaParams={
    "oracle.odi.prefer.dataserver.packages": "",
    "key.deserializer": "org.apache.kafka.common.serialization.StringDeserializer",
    "value.deserializer": "org.apache.kafka.common.serialization.ByteArrayDeserializer",
    "zookeeper.connect": "maprdemo:5181",
    "metadata.broker.list": "this.will.be.ignored:9092",
    "group.id": "New_Mapping_2_Physical"}, fromOffsets=None, messageHandler=None)
Traceback (most recent call last):
File "/tmp/New_Mapping_2_Physical.py", line 77, in <module>
,"group.id" : "New_Mapping_2_Physical"}, fromOffsets=None, messageHandler=None)
File "/opt/mapr/spark/spark-1.6.1/python/lib/pyspark.zip/pyspark/streaming/kafka.py", line 152, in createDirectStream
py4j.protocol.Py4JJavaError: An error occurred while calling o58.createDirectStreamWithoutMessageHandler.
: org.apache.spark.SparkException: java.nio.channels.ClosedChannelException
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$checkErrors$1.apply(KafkaCluster.scala:366)
at scala.util.Either.fold(Either.scala:97)
at org.apache.spark.streaming.kafka.KafkaCluster$.checkErrors(KafkaCluster.scala:365)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:222)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStream(KafkaUtils.scala:720)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createDirectStreamWithoutMessageHandler(KafkaUtils.scala:688)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
In Scala it seems to work fine, but in PySpark it does not.

I downloaded the latest build, http://package.mapr.com/releases/ecosystem-5.x/redhat/mapr-spark-1.6.1.201612010646-1.noarch.rpm, and it resolved the issue.
I checked the PySpark kafka.py and found it had been updated: I had been using label 1605, and am now on 1611.

Related

Read a text file from S3 into a Spark df: UnsupportedOperationException

I am trying to read a text file from on-prem, S3-compatible object storage using Spark and I am getting an UnsupportedOperationException. I am unsure what it is pointing to and have tried adjusting the code, thinking the problem might be the spark.read command. I have tried read.text and read.csv, both of which should work, but both result in the same error. The full stack trace is below along with the code:
Code being used:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("s3reader") \
    .getOrCreate()
sc = spark.sparkContext
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key","xxxxxxxxxxxx")
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "xxxxxxxxxxxxxx")
sc._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")
df = spark.read.text("https://s3a.us-east-1.xxxx.xxxx.xxxx.com/bronze/xxxxxxx/test.txt")
print(df)
Stack trace:
Traceback (most recent call last):
File "/home/cloud/sparks3test.py", line 19, in <module>
df = spark.read.text("https://s3a.us-east-1.tpavcps3ednrg1.vici.verizon.com/bronze/CoreMetrics/test.txt")
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 516, in text
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
File "/usr/local/bin/spark-3.1.2-bin-hadoop3.2/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o31.text.
: java.lang.UnsupportedOperationException
at org.apache.hadoop.fs.http.AbstractHttpFileSystem.listStatus(AbstractHttpFileSystem.java:91)
at org.apache.hadoop.fs.http.HttpsFileSystem.listStatus(HttpsFileSystem.java:23)
at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:238)
at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:158)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:131)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:94)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:66)
at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:581)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:417)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:325)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:307)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:307)
at org.apache.spark.sql.DataFrameReader.text(DataFrameReader.scala:944)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.base/java.lang.Thread.run(Thread.java:829)
Try reading the file from S3 with the s3a:// scheme instead of https://, like below:
s3a://bucket/bronze/xxxxxxx/test.txt
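A minimal sketch against the question's session (the endpoint value is an assumption; an on-prem S3-compatible store usually also needs fs.s3a.endpoint pointed at it explicitly):
# Keep the fs.s3a.* settings from the question; only the URI scheme changes,
# plus an explicit endpoint for the on-prem store (placeholder URL).
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "https://s3.us-east-1.example.com")
df = spark.read.text("s3a://bucket/bronze/xxxxxxx/test.txt")
df.show()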

How to use MariaDB Connector/J with PySpark for JDBC?

I'm using PySpark (Spark 3.0.1) on Ubuntu 18.04 and want to export data to a MariaDB server using JDBC.
I'm specifying the Connector/J jar on the pyspark command line like this:
$ pyspark --jars /usr/share/java/mariadb-java-client.jar
However, when I try to use the JDBC connection, I get the following error:
>>> df1 = sc.parallelize([[1,2,3], [2,3,4]]).toDF(("a", "b", "c"))
>>> df1.write.format("jdbc") \
... .mode("overwrite") \
... .option("url", "jdbc:mariadb://localhost:3306/testDatabase?user=foo&password=bar") \
... .option("dbtable", "example") \
... .save()
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/opt/spark/python/pyspark/sql/readwriter.py", line 825, in save
self._jwrite.save()
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/opt/spark/python/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o60.save.
: java.sql.SQLException: No suitable driver
at java.sql.DriverManager.getDriver(DriverManager.java:315)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$2(JDBCOptions.scala:105)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:105)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:194)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:198)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:45)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:963)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:963)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:415)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:399)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
>>>
Because of java.sql.SQLException: No suitable driver, I assume I need some additional configuration for Connector/J to be invoked. I'm not seeing how to do it, though. What's the trick?
You need to specify the MariaDB driver class org.mariadb.jdbc.Driver using the driver option when writing:
df1.write.format("jdbc") \
.mode("overwrite") \
.option("driver", "org.mariadb.jdbc.Driver") \
.option("url", "jdbc:mysql://localhost:3306/testDatabase?user=foo&password=bar") \
.option("dbtable", "example") \
.save()
See Usage in the docs.
For anyone still facing this error: use mysql in the URL, not mariadb.
The JDBC URL should look like jdbc:mysql://{host} ... in place of jdbc:mariadb://{host} ....

When Spark calls Hive from Oozie, the exception "java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveException" is raised

I have a Spark job that saves data to HDFS and then saves the same data to a Hive table. When I run it in Jupyter, it runs successfully, but when I run it through Oozie, it raises the following exception when it reaches the step that writes data to Hive.
Here is my code, followed by the exception and the workflow XML:
# coding: utf-8
# In[10]:
import os
JARS_HOME = "hdfs:///dataengineering/jars"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars '+JARS_HOME+'/ojdbc6.jar,'+JARS_HOME+'/anonymize_udfs.jar pyspark-shell'
os.environ["HADOOP_CONF_DIR"] = '/etc/hadoop/conf'
# In[11]:
try:
    from pyspark import SparkContext, SQLContext
    from pyspark.sql import SparkSession
except:
    import findspark
    findspark.init('/opt/cloudera/parcels/CDH-6.1.1-1.cdh6.1.1.p0.875250/lib/spark')
    from pyspark import SparkContext, SQLContext
    from pyspark.sql import SparkSession
import sys
import pyspark.sql.functions as functions
from datetime import date
from dateutil.relativedelta import relativedelta
from datetime import datetime
from datetime import timedelta
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from pyspark.sql import functions as sf
from pyspark.sql.types import StringType
spark = SparkSession.builder \
    .master("yarn") \
    .appName("oozie_sample_spark") \
    .config('spark.executor.cores', '3') \
    .config('spark.executor.memory', '15g') \
    .config('spark.driver.memory', '5g') \
    .config('spark.driver.maxResultSize', '12g') \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.executor.instances", "4") \
    .config("spark.yarn.queue", "root.STREAMING") \
    .config("spark.dynamicAllocation.cachedExecutorIdleTimeout", "300s") \
    .config("hive.metastore.uris", "thrift://dchqmaster01.internal.eg.vodafone.com:9083") \
    .getOrCreate()
# In[13]:
spark.sql("select current_timestamp() column_a").write.csv("/user/akhamis11/oozie-samples/spark-sample/current_column.csv", mode='append')
spark.sql("select current_timestamp() column_a").write.saveAsTable("bde.oozie_test", mode='append')
spark.stop()
2020-04-13 07:27:21,077 [dispatcher-event-loop-1] INFO org.apache.spark.deploy.yarn.YarnAllocator - Driver requested a total number of 0 executor(s).
2020-04-13 07:27:21,081 [Thread-10] INFO org.apache.spark.sql.execution.datasources.FileFormatWriter - Write Job 316245be-4c54-42d9-bd43-6246d77672b0 committed.
2020-04-13 07:27:21,108 [Thread-10] INFO org.apache.spark.sql.execution.datasources.FileFormatWriter - Finished processing stats for write job 316245be-4c54-42d9-bd43-6246d77672b0.
2020-04-13 07:27:21,191 [Thread-10] INFO com.cloudera.spark.lineage.NavigatorQueryListener - Failed to generate lineage for successful query execution.
java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:192)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:103)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:102)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.org$apache$spark$sql$hive$HiveSessionStateBuilder$$externalCatalog(HiveSessionStateBuilder.scala:39)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:90)
at org.apache.spark.sql.query.analysis.QueryAnalysis$.hiveCatalog(QueryAnalysis.scala:63)
at org.apache.spark.sql.query.analysis.QueryAnalysis$.getLineageInfo(QueryAnalysis.scala:88)
at com.cloudera.spark.lineage.NavigatorQueryListener.onSuccess(ClouderaNavigatorListener.scala:60)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1$$anonfun$apply$mcV$sp$1.apply(QueryExecutionListener.scala:124)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1$$anonfun$apply$mcV$sp$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling$1.apply(QueryExecutionListener.scala:145)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling$1.apply(QueryExecutionListener.scala:143)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.collection.mutable.ListBuffer.foreach(ListBuffer.scala:45)
at org.apache.spark.sql.util.ExecutionListenerManager.org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling(QueryExecutionListener.scala:143)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply$mcV$sp(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:123)
at org.apache.spark.sql.util.ExecutionListenerManager.readLock(QueryExecutionListener.scala:156)
at org.apache.spark.sql.util.ExecutionListenerManager.onSuccess(QueryExecutionListener.scala:122)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:670)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:656)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:189)
... 39 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveException
at org.apache.spark.sql.hive.HiveExternalCatalog.<init>(HiveExternalCatalog.scala:73)
... 44 more
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.metadata.HiveException
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 45 more
Traceback (most recent call last):
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o148.saveAsTable.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:192)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:103)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:102)
at org.apache.spark.sql.hive.HiveSessionStateBuilder.org$apache$spark$sql$hive$HiveSessionStateBuilder$$externalCatalog(HiveSessionStateBuilder.scala:39)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$1.apply(HiveSessionStateBuilder.scala:54)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:90)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.tableExists(SessionCatalog.scala:415)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:405)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:400)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.internal.SharedState$.org$apache$spark$sql$internal$SharedState$$reflect(SharedState.scala:189)
... 21 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/metadata/HiveException
at org.apache.spark.sql.hive.HiveExternalCatalog.<init>(HiveExternalCatalog.scala:73)
... 26 more
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "oozie_sample_spark.py", line 53, in <module>
spark.sql("select current_timestamp() column_a").write.saveAsTable("bde.oozie_test", mode='append')
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/readwriter.py", line 775, in saveAsTable
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/disk10/yarn/nm/usercache/akhamis11/appcache/application_1586733850175_0103/container_1586733850175_0103_02_000001/pyspark.zip/pyspark/sql/utils.py", line 79, in deco
pyspark.sql.utils.IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog':"
<action name='spark-node'>
    <spark xmlns="uri:oozie:spark-action:1.0">
        <resource-manager>${resourceManager}</resource-manager>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>spark2</value>
            </property>
            <property>
                <name>oozie.service.HCatAccessorService.hcat.configuration</name>
                <value>/opt/cloudera/parcels/CDH/etc/hive/conf.dist/hive-site.xml</value>
            </property>
        </configuration>
        <master>${master}</master>
        <name>oozies_sample</name>
        <jar>${nameNode}/user/${wf:user()}/oozie-samples/spark-sample/lib/oozie_sample_spark.py</jar>
    </spark>
    <ok to="end" />
    <error to="fail" />
</action>
<kill name="fail">
    <message>Workflow failed, error
    message[${wf:errorMessage(wf:lastErrorNode())}]
    </message>
</kill>
<end name='end' />
[~]$ hdfs dfs -ls /user/oozie/share/lib/lib_<ts>/spark2
/user/oozie/share/lib/lib_<ts>/spark2/HikariCP-java7-2.4.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/RoaringBitmap-0.5.11.jar
/user/oozie/share/lib/lib_<ts>/spark2/accessors-smart-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/activation-1.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/annotations-2.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/antlr4-runtime-4.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/aopalliance-1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/aopalliance-repackaged-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/arpack_combined_all-0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-format-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-memory-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/arrow-vector-0.10.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/audience-annotations-0.5.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro-ipc.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro-mapred-hadoop2.jar
/user/oozie/share/lib/lib_<ts>/spark2/avro.jar
/user/oozie/share/lib/lib_<ts>/spark2/aws-java-sdk-bundle-1.11.271.jar
/user/oozie/share/lib/lib_<ts>/spark2/azure-keyvault-core-0.8.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/azure-storage-5.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/breeze-macros_2.11-0.13.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/breeze_2.11-0.13.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/chill-java-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/chill_2.11-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-beanutils-1.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-cli-1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-codec-1.10.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-collections-3.2.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-compiler-3.0.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-compress-1.4.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-configuration2-2.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-crypto-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-io-2.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-lang-2.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-lang3-3.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-logging-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-math3-3.4.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-net-3.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/commons-pool-1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/compress-lzf-1.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/core-1.1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-client-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-framework-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/curator-recipes-2.7.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/datanucleus-core-4.1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/derby-10.14.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/ehcache-3.3.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flatbuffers-1.2.0-3f79e055.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-config-filter-api-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-configuration-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-core-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/flume-ng-sdk-1.8.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/geronimo-jcache_1.0_spec-1.0-alpha-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/gson-2.2.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/guava-11.0.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/guice-4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/guice-servlet-4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-annotations.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-auth.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-aws.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-azure-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-client-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-hdfs-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-core.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-mapreduce-client-jobclient.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-openstack-3.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-api.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-server-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hadoop-yarn-server-web-proxy.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-exec.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-hcatalog-core.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-hcatalog-pig-adapter.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-metastore.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-serde.jar
/user/oozie/share/lib/lib_<ts>/spark2/hive-site.xml
/user/oozie/share/lib/lib_<ts>/spark2/hive-webhcat-java-client.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-api-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-locator-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hk2-utils-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/hppc-0.7.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/htrace-core4-4.1.0-incubating.jar
/user/oozie/share/lib/lib_<ts>/spark2/httpclient-4.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/httpcore-4.4.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/ivy-2.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-annotations-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-core-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-core-asl-1.9.13.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-databind-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-dataformat-cbor-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-jaxrs-base-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-jaxrs-json-provider-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-mapper-asl-1.9.13-cloudera.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-jaxb-annotations-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-paranamer-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/jackson-module-scala_2.11-2.9.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/janino-3.0.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/javassist-3.18.1-GA.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.annotation-api-1.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.inject-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.inject-2.4.0-b34.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.servlet-api-3.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/javax.ws.rs-api-2.0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jaxb-api-2.2.11.jar
/user/oozie/share/lib/lib_<ts>/spark2/jcip-annotations-1.0-1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jcl-over-slf4j-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-client-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-common-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-container-servlet-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-container-servlet-core-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-guava-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-media-jaxb-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jersey-server-2.22.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-jmx-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-util-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-util-ajax-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-webapp-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/jetty-xml-9.3.20.v20170531.jar
/user/oozie/share/lib/lib_<ts>/spark2/joda-time-2.9.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/jodd-core-3.5.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsch-0.1.54.jar
/user/oozie/share/lib/lib_<ts>/spark2/json-smart-2.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-ast_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-core_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-jackson_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/json4s-scalap_2.11-3.5.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsp-api-2.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/jsr305-1.3.9.jar
/user/oozie/share/lib/lib_<ts>/spark2/jtransforms-2.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/jul-to-slf4j-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/kafka-clients-2.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-admin-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-client-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-common-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-core-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-crypto-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-identity-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-server-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-simplekdc-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerb-util-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-asn1-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-config-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-pkix-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-util-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kerby-xdr-1.0.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/kryo-shaded-4.0.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/leveldbjni-all-1.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/libfb303-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/libthrift-0.9.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/log4j-1.2.17.jar
/user/oozie/share/lib/lib_<ts>/spark2/lz4-java-1.4.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/machinist_2.11-0.6.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/macro-compat_2.11-1.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-core-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-graphite-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-json-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/metrics-jvm-3.1.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/mina-core-2.0.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/minlog-1.3.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/mssql-jdbc-6.2.1.jre7.jar
/user/oozie/share/lib/lib_<ts>/spark2/netty-3.10.6.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/netty-all-4.1.17.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/nimbus-jose-jwt-4.41.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/objenesis-2.5.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/okhttp-2.7.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/okio-1.6.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/oozie-sharelib-spark-5.0.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/oozie-sharelib-spark.jar
/user/oozie/share/lib/lib_<ts>/spark2/opencsv-2.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/oro-2.0.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/osgi-resource-locator-1.0.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/paranamer-2.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-column.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-common.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-encoding.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-format.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-hadoop.jar
/user/oozie/share/lib/lib_<ts>/spark2/parquet-jackson.jar
/user/oozie/share/lib/lib_<ts>/spark2/protobuf-java-2.5.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/py4j-0.10.7-src.zip
/user/oozie/share/lib/lib_<ts>/spark2/py4j-0.10.7.jar
/user/oozie/share/lib/lib_<ts>/spark2/pyrolite-4.13.jar
/user/oozie/share/lib/lib_<ts>/spark2/pyspark.zip
/user/oozie/share/lib/lib_<ts>/spark2/re2j-1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-compiler-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-library-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-parser-combinators_2.11-1.1.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-reflect-2.11.12.jar
/user/oozie/share/lib/lib_<ts>/spark2/scala-xml_2.11-1.0.5.jar
/user/oozie/share/lib/lib_<ts>/spark2/shapeless_2.11-2.3.2.jar
/user/oozie/share/lib/lib_<ts>/spark2/slf4j-api-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/slf4j-log4j12-1.7.25.jar
/user/oozie/share/lib/lib_<ts>/spark2/snappy-java-1.1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-avro_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-catalyst_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-core_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-graphx_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-hadoop-cloud_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-hive_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-kvstore_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-launcher_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-lineage_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-mllib-local_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-mllib_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-network-common_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-network-shuffle_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-repl_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sketch_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sql-kafka-0-10_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-sql_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-flume-sink_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-flume_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming-kafka-0-10_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-streaming_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-tags_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-unsafe_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spark-yarn_2.11-2.4.0-cdh6.1.1.jar
/user/oozie/share/lib/lib_<ts>/spark2/spire-macros_2.11-0.13.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/spire_2.11-0.13.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/stax2-api-3.1.4.jar
/user/oozie/share/lib/lib_<ts>/spark2/stream-2.7.0.jar
/user/oozie/share/lib/lib_<ts>/spark2/univocity-parsers-2.7.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/validation-api-1.1.0.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/wildfly-openssl-1.0.4.Final.jar
/user/oozie/share/lib/lib_<ts>/spark2/woodstox-core-5.0.3.jar
/user/oozie/share/lib/lib_<ts>/spark2/xbean-asm6-shaded-4.8.jar
/user/oozie/share/lib/lib_<ts>/spark2/xz-1.6.jar
/user/oozie/share/lib/lib_<ts>/spark2/zookeeper.jar
/user/oozie/share/lib/lib_<ts>/spark2/zstd-jni-1.3.2-2.jar
Error while instantiating 'org.apache.spark.sql.hive.HiveExternalCatalog' means the catalog jar Spark is trying to find is not in the Oozie sharelib spark directory.
Please add the following property to your job.properties file:
oozie.action.sharelib.for.spark=hive,spark,hcatalog (use spark2 instead of spark if you are on Cloudera)
This makes the Hive jars available to the Spark action; the Spark Oozie sharelib directory does not contain every jar needed for an external Hive catalog.
The jar containing org/apache/commons/dbcp/ConnectionFactory lives under the Hive lib folder, so please check whether that jar exists on your local file system as well as in HDFS:
find <location> -name "*.jar" | xargs grep ConnectionFactory
Also add .enableHiveSupport() to your SparkSession builder to enable Hive support for Spark SQL.
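As a minimal sketch, the builder from the question would gain one call (the other .config() options from the question stay exactly as they are):
spark = SparkSession.builder \
    .master("yarn") \
    .appName("oozie_sample_spark") \
    .config("hive.metastore.uris", "thrift://dchqmaster01.internal.eg.vodafone.com:9083") \
    .enableHiveSupport() \
    .getOrCreate()  # enableHiveSupport() routes saveAsTable() through the Hive external catalog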

Error while running a Python program for Spark Streaming with Kafka

I have the latest Spark (2.1.0) and Python (3.5.3) installed, plus Kafka (2.10.0) installed locally.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pykafka import KafkaClient
import json
import sys
import pprint
spsc = SparkContext(appName="SampleApp")
stsc = StreamingContext(spsc, 1)
print('contexts =================== {} {}'.format(spsc,stsc));
kvs = KafkaUtils.createStream(stsc, "localhost:2181", "spark-consumer", {"7T-test3": 1})
spsc.stop()
The 'print' line executes fine, but on the next line, while creating the stream, I get the following error:
Traceback (most recent call last):
File "/Users/MacAdmin/Downloads/spark-streaming/spark/spark_streaming_osample.py", line 24, in <module>
kvs = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"7T-test3": 1})
File "/Users/MacAdmin/Documents/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/streaming/kafka.py", line 70, in createStream
File "/Users/MacAdmin/Documents/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/Users/MacAdmin/Documents/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o25.createStream.
: java.lang.NoClassDefFoundError: org/apache/spark/Logging
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.streaming.kafka.KafkaUtils$.createStream(KafkaUtils.scala:91)
at org.apache.spark.streaming.kafka.KafkaUtils$.createStream(KafkaUtils.scala:168)
at org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper.createStream(KafkaUtils.scala:632)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.Logging
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 25 more
I run my program from the command line as:
/Users/MacAdmin/Documents/spark-2.1.0-bin-hadoop2.7/bin/spark-submit --jars spark-streaming-kafka-assembly_2.10-1.6.3.jar spark_streaming_sample.py
Do I need to set any environment variables, or am I not using the correct library versions?
A few things were missing; I added the class paths:
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip/:$PYTHONPATH
Also, Spark's Logging class is private from 2.x onwards, so I had to use the Kafka streaming assembly below when running the program:
spark-streaming-kafka-0-8-assembly_2.10-2.1.0.jar
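Putting it together, the submit command from the question becomes (a sketch, assuming the assembly jar sits in the current directory):
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip/:$PYTHONPATH
/Users/MacAdmin/Documents/spark-2.1.0-bin-hadoop2.7/bin/spark-submit \
    --jars spark-streaming-kafka-0-8-assembly_2.10-2.1.0.jar \
    spark_streaming_sample.py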
Make sure that you have created the topic (7T-test3) in Kafka before executing the stream; a creation sketch follows below.
You may also want to provide more details leading up to the error.
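For Kafka versions of that era, the topic can be created with the ZooKeeper-based CLI (a sketch; the script path and ZooKeeper address assume a default local install):
bin/kafka-topics.sh --create --zookeeper localhost:2181 \
    --replication-factor 1 --partitions 1 --topic 7T-test3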

RDD doesn't work

I'm currently working on a project and can't seem to overcome an error in Spark.
Functions like .first() and .collect() won't give results.
This is my code:
import os
import sys
# Path for spark source folder
os.environ['SPARK_HOME']="C:\spark-2.0.1-bin-hadoop2.7"
# Append pyspark to Python Path
sys.path.append("C:\spark-2.0.1-bin-hadoop2.7\python ")
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)
import re
sc = SparkContext()
file = sc.textFile('rC:\\essay.txt')
word = file.map(lambda line: re.split(r'[?:\n|\s]\s*', line))
word.first()
When I run it in PyCharm, it generates the following:
Successfully imported Spark Modules
16/12/18 17:23:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
16/12/18 17:23:43 WARN SizeEstimator: Failed to check whether UseCompressedOops is set; assuming yes
Traceback (most recent call last):
File "C:/Users/User1/PycharmProjects/BigData/SparkMatrice.py", line 43, in <module>
word.first()
File "C:\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 1328, in first
File "C:\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 1280, in take
File "C:\spark-2.0.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\rdd.py", line 2388, in getNumPartitions
File "C:\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\java_gateway.py", line 1133, in __call__
File "C:\spark-2.0.1-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o19.partitions.
: java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: rC:%5Cessay.txt
at org.apache.hadoop.fs.Path.initialize(Path.java:205)
at org.apache.hadoop.fs.Path.<init>(Path.java:171)
at org.apache.hadoop.util.StringUtils.stringToPath(StringUtils.java:245)
at org.apache.hadoop.mapred.FileInputFormat.setInputPaths(FileInputFormat.java:411)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$29.apply(SparkContext.scala:992)
at org.apache.spark.SparkContext$$anonfun$hadoopFile$1$$anonfun$29.apply(SparkContext.scala:992)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD$$anonfun$getJobConf$6.apply(HadoopRDD.scala:176)
at scala.Option.map(Option.scala:146)
at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:176)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:195)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:246)
at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:60)
at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.URISyntaxException: Relative path in absolute URI: rC:%5Cessay.txt
at java.net.URI.checkPath(Unknown Source)
at java.net.URI.<init>(Unknown Source)
at org.apache.hadoop.fs.Path.initialize(Path.java:202)
... 32 more
The same thing happens when I replace .first() with .collect(), and when I use the terminal instead of PyCharm.
I hope that someone can help me figure out what is wrong.
The problem is listed there for you: your path is wrong. The r prefix for a raw string ended up inside the quotes, so Spark treats rC: as part of the path:
Caused by: java.net.URISyntaxException: Relative path in absolute URI: rC:%5Cessay.txt
at java.net.URI.checkPath(Unknown Source)
You need to change
file = sc.textFile('rC:\\essay.txt')
to
file = sc.textFile(r'C:\\essay.txt')
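With the prefix outside the quotes, the path parses and the action runs (a quick check using the question's own file and split logic):
file = sc.textFile(r'C:\\essay.txt')  # raw-string prefix now outside the quotes
word = file.map(lambda line: re.split(r'[?:\n|\s]\s*', line))
print(word.first())  # partitions now resolve without the URISyntaxException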
