Spark Hive reporting java.lang.NoSuchMethodError: org.apache.hadoop.hive.metastore.api.Table.setTableName(Ljava/lang/String;)V - apache-spark

I am trying to use SparkSession to reading data from Hive.
my code:
val warehouseLocation = "/user/xx/warehouse"
val spark = SparkSession
.builder()
.master("local[*]")
.appName("HiveReceiver")
.config("spark.sql.warehouse.dir",warehouseLocation)
.enableHiveSupport()
.getOrCreate()
import spark.sql
sql("select * from sparktest.test").show()
spark.stop()
my versions:
spark:2.1.1
hive:1.2.1
hadoop:2.7.1
but there are some Exceptions when it run in IDEA:
Exception in thread "main" java.lang.NoSuchMethodError:
org.apache.hadoop.hive.metastore.api.Table.setTableName(Ljava/lang/String;)V
at
org.apache.spark.sql.hive.MetastoreRelation.(MetastoreRelation.scala:76)
at
org.apache.spark.sql.hive.HiveMetastoreCatalog.lookupRelation(HiveMetastoreCatalog.scala:142)
at
org.apache.spark.sql.hive.HiveSessionCatalog.lookupRelation(HiveSessionCatalog.scala:70)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupTableFromCatalog(Analyzer.scala:457)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:479)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:464)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:61)
at
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:60)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:58)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:58)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:307)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:305)
at
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:58)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:464)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:454)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
at
scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
at scala.collection.immutable.List.foldLeft(List.scala:84) at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
at scala.collection.immutable.List.foreach(List.scala:381) at
org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
at
org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:69)
at
org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:67)
at
org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:50)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:63) at
org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592) at
com.bdp.steaming.HiveReceiver$.main(HiveReceiver.scala:24) at
com.bdp.steaming.HiveReceiver.main(HiveReceiver.scala)
someone can tell where is the bug?

I have solved this question.In my case,there are two hive-metastore dependencies in my project,then i excluded a hive-metastore dependency.It worked.

Related

NoSuchMethodError trying to ingest HDFS data into Elasticsearch

I'm using Spark 3.12, Scala 2.12, Hadoop 3.1.1.3.1.2-50, Elasticsearch 7.10.1 (due to license issues), Centos 7
to try an ingest json data in gzip files located on HDFS into Elasticsearch using spark streaming.
I get a
Logical Plan:
FileStreamSource[hdfs://pct/user/papago-mlops-datalake/raw/mt-log/engine=n2mt/year=2022/date=0430/hour=00]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:356)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:244)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(Lorg/apache/spark/sql/SparkSession;Lorg/apache/spark/sql/execution/QueryExecution;Lscala/Function0;)Ljava/lang/Object;
at org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink.addBatch(EsSparkSqlStreamingSink.scala:62)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:226)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:194)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:188)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:334)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:317)
... 1 more
ApplicationMaster host: ac3m8x2183.bdp.bdata.ai
ApplicationMaster RPC port: 39673
queue: batch
start time: 1654588583366
final status: FAILED
tracking URL: https://gemini-rm2.bdp.bdata.ai:9090/proxy/application_1654575947385_29572/
user: papago-mlops-datalake
Exception in thread "main" org.apache.spark.SparkException: Application application_1654575947385_29572 finished with failed status
at org.apache.spark.deploy.yarn.Client.run(Client.scala:1269)
at org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1627)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:904)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
using
implementation("org.elasticsearch:elasticsearch-hadoop:8.2.2")
implementation("com.typesafe:config:1.4.2")
implementation("org.apache.spark:spark-sql_2.12:3.1.2")
testImplementation("org.scalatest:scalatest_2.12:3.2.12")
testRuntimeOnly("com.vladsch.flexmark:flexmark-all:0.61.0")
compileOnly("org.apache.spark:spark-sql_2.12:3.1.2")
compileOnly("org.apache.spark:spark-core_2.12:3.1.2")
compileOnly("org.apache.spark:spark-launcher_2.12:3.1.2")
compileOnly("org.apache.spark:spark-streaming_2.12:3.1.2")
compileOnly("org.elasticsearch:elasticsearch-spark-30_2.12:8.2.2")
libraries. I tried using ES-Hadoop version 7.10.1, but ES-Spark only supports down to 7.12.0 for Spark 3.0 and I still get the same error.
My code is pretty simple
def main(args: Array[String]): Unit = {
// Set the log level to only print errors
Logger.getLogger("org").setLevel(Level.ERROR)
val spark = SparkSession
.builder()
.config(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, elasticsearchUser)
.config(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, elasticsearchPass)
.config(ConfigurationOptions.ES_NODES, elasticsearchHost)
.config(ConfigurationOptions.ES_PORT, elasticsearchPort)
.appName(appName)
.master(master)
.getOrCreate()
val streamingDF: DataFrame = spark.readStream
.schema(jsonSchema)
.format("org.apache.spark.sql.execution.datasources.json.JsonFileFormat")
.load(pathToJSONResource)
streamingDF.writeStream
.outputMode(outputMode)
.format(destination)
.option("checkpointLocation", checkpointLocation)
.start(indexAndDocType)
.awaitTermination()
// Stop the session
spark.stop()
}
}
If I can't use the ES-Hadoop libraries is there another way I can go about ingesting JSON into ES from HDFS?

Spark S3 null uri host

val spark = SparkSession.builder
.appName(appName)
.config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
.config("hive.exec.dynamic.partition", "true")
.config("hive.exec.dynamic.partition.mode", "nonstrict")
.config("hive.exec.max.dynamic.partitions", 5000)
.config("hive.exec.max.dynamic.partitions.pernode", 5000)
.enableHiveSupport()
.master("local[2]")
.getOrCreate()
spark
.sparkContext
.hadoopConfiguration
.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
spark.read.json("s3a:///bucketname/foldername/").inputFiles
Raises the following exception
Exception in thread "main" java.lang.NullPointerException: null uri host.
at java.util.Objects.requireNonNull(Objects.java:228)
at org.apache.hadoop.fs.s3native.S3xLoginHelper.buildFSURI(S3xLoginHelper.java:73)
at org.apache.hadoop.fs.s3a.S3AFileSystem.setUri(S3AFileSystem.java:470)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:235)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:391)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:325)
I have verified that I am able to read from the bucket and have the correct permissions.
Apparently I was missing the bucket name from the path. Also used s3a:// instead of s3a:///

Hive is not accessible via Spark In Kerberos Environment : Client cannot authenticate via:[TOKEN, KERBEROS]

Hi All, I'm running Spark(2.4.4) in kerberos environment, I've written a code to query Hive Table Via Spark. I am doing kinit also in spark-submit command, but still i'm facing
java.io.IOException:
org.apache.hadoop.security.AccessControlException: Client cannot authenticate via:[TOKEN, KERBEROS];
Here is My code:-
#transient lazy val spark: SparkSession = getSparkSession()
def getSparkSession(): SparkSession = {
log.info("Creating spark session")
var sparkBuilder: SparkSession.Builder = SparkSession.builder().
master("local[*]").
appName("Query Hive Via Spark").
config("hive.exec.scratchdir", "/tmp/hive").enableHiveSupport().
config("hive.exec.dynamic.partition", "true").
config("hive.exec.dynamic.partition.mode", "nonstrict").
config("hive.exec.max.dynamic.partitions", "1000")
#transient lazy val spark: SparkSession = sparkBuilder.getOrCreate()
registerUdfs(spark)
spark.sparkContext.setLogLevel(logLevel)
spark
}
Code to Access Hive Tables via Spark Sql.
val resultDF= spark.sql(s"SELECT count(*) AS cnt FROM brl_in_cash.cash_in_incoming_data WHERE insert_date='20200821'")
resultDF.printSchema()
resultDF.show(false)
I am executing a shell script for spark-submit where i am doing kinit and also passing --principal $KERBEROS_PRINCIPAL --keytab $KERBEROS_KEYTAB .
Spark-submit Command :-
spark-submit --master yarn --deploy-mode cluster \
--verbose \
--name ${appName} \
--principal $KERBEROS_PRINCIPAL \
--keytab $KERBEROS_KEYTAB \
--driver-memory 4g \
--executor-memory 4g \
--executor-cores 2 \
--files ${hiveSite.xml} \
--conf spark.hadoop.yarn.timeline-service.enabled=false \
--conf spark.hadoop.yarn.client.failover-proxy-provider=org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider \
--conf spark.security.credentials.EsServiceCredentialProvider.enabled=false \
--class com.dpk.hive.HiveViaSpark "${jarPath}"
Error Log :-
20/08/26 13:34:17 INFO TezClient: Failed to retrieve AM Status via proxy
com.google.protobuf.ServiceException: java.io.IOException: Failed on local exception: java.io.IOException: org.apache.hadoop.security.AccessControlException: Client cannot authenticate via:[TOKEN, KERBEROS]; Host Details : local host is: "dfghcv012.global.xyz.com/10.7.1.52"; destination host is: "dfghcv013.global.xyz.com":43890;
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:243)
at com.sun.proxy.$Proxy36.getAMStatus(Unknown Source)
at org.apache.tez.client.TezClient.getAppMasterStatus(TezClient.java:618)
at org.apache.tez.client.TezClient.waitTillReady(TezClient.java:697)
at org.apache.hadoop.hive.ql.exec.tez.TezSessionState.open(TezSessionState.java:205)
at org.apache.hadoop.hive.ql.exec.tez.TezSessionState.open(TezSessionState.java:116)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:532)
at org.apache.spark.sql.hive.client.HiveClientImpl.newState(HiveClientImpl.scala:183)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:117)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:271)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:384)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:286)
at org.apache.spark.sql.hive.HiveExternalCatalog.client$lzycompute(HiveExternalCatalog.scala:66)
at org.apache.spark.sql.hive.HiveExternalCatalog.client(HiveExternalCatalog.scala:65)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply$mcZ$sp(HiveExternalCatalog.scala:215)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:215)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:215)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:214)
at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:114)
at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:102)
at org.apache.spark.sql.internal.SharedState.globalTempViewManager$lzycompute(SharedState.scala:141)
at org.apache.spark.sql.internal.SharedState.globalTempViewManager(SharedState.scala:136)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$2.apply(HiveSessionStateBuilder.scala:55)
at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anonfun$2.apply(HiveSessionStateBuilder.scala:55)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.globalTempViewManager$lzycompute(SessionCatalog.scala:91)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.globalTempViewManager(SessionCatalog.scala:91)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.isTemporaryTable(SessionCatalog.scala:736)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.isRunningDirectlyOnFiles(Analyzer.scala:747)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.resolveRelation(Analyzer.scala:681)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:713)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:706)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$apply$1.apply(AnalysisHelper.scala:90)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$apply$1.apply(AnalysisHelper.scala:90)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:89)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:194)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.resolveOperatorsUp(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:329)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:327)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:194)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.resolveOperatorsUp(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1$$anonfun$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:329)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:327)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:87)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveOperatorsUp$1.apply(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:194)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.resolveOperatorsUp(AnalysisHelper.scala:86)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUp(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:706)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:652)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
at scala.collection.immutable.List.foldLeft(List.scala:84)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:127)
at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:121)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:106)
at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)
at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at com.sc.sdm.rt.oa.recon.TestConnection$.main(TestConnection.scala:34)
at com.sc.sdm.rt.oa.recon.TestConnection.main(TestConnection.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:684)
Caused by: java.io.IOException: Failed on local exception: java.io.IOException: org.apache.hadoop.security.AccessControlException: Client cannot authenticate via:[TOKEN, KERBEROS]; Host Details : local host is: "dfghcv012.global.xyz.com/10.7.1.52"; destination host is: "dfghcv013.global.xyz.com":43890;
at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:776)
at org.apache.hadoop.ipc.Client.call(Client.java:1479)
at org.apache.hadoop.ipc.Client.call(Client.java:1412)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229)
... 91 more
Any Help is Appreciated!!
I was able to resolve this.
I removed
--files hive-site.xml
added spark configuration
--conf spark.security.credentials.hadoopfs.enabled=true
Above changes worked for me.

I save a DataFrame in Hbase and I get: java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/TableDescriptor

I created a project on Apache Spark.
Version:
scala 2.11.8
apache spark 2.3.0
apache hbase 1.2.0
hortonworks shc 1.1.0.3.1.2.0-4 (the hortonworks connector)
I need to save a simple DataFrame in an HBase table. For this I started HBase 1.2.0 in Docker container (https://github.com/zhao-y/docker-hbase-pseudo) and created the following table:
$ hbase(main):002:0> create "table1", "cf1", "cf2", "cf3", "cf4", "cf5", "cf6", "cf7", "cf8"
$ 0 row (s) in 1.4440 seconds
To save a DataFrame in Hbase I use: https://github.com/hortonworks-spark/shc
I declared the catalog exactly as in the example
I created a catalog-based dataframe
I tried to save dataframe in hbase as in example:
dataFrame.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
Code:
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.junit.Test
class SparkTest {
case class HBaseRecord(
col0: String,
col1: Boolean,
col2: Double,
col3: Float,
col4: Int,
col5: Long,
col6: Short,
col7: String,
col8: Byte)
object HBaseRecord {
def apply(i: Int, t: String): HBaseRecord = {
val s = s"""row${"%03d".format(i)}"""
HBaseRecord(s,
i % 2 == 0,
i.toDouble,
i.toFloat,
i,
i.toLong,
i.toShort,
s"String$i: $t",
i.toByte)
}
}
#Test
def bar(): Unit = {
val sparkSession = SparkSession.builder
.appName("SparkTest")
.master("local[*]")
.config("spark.testing.memory", 2147480000)
.getOrCreate()
val data = (0 to 255).map { i => HBaseRecord(i, "extra") }
val dataFrame = sparkSession.createDataFrame(data)
dataFrame.show
dataFrame.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
}
}
Error:
java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/TableDescriptor
at org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(HBaseRelation.scala:63)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
at SparkTest.bar(SparkTest.scala:56)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
at org.junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:98)
at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java:79)
at org.junit.internal.runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:87)
at org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
at org.junit.internal.runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
at org.junit.internal.runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
at org.junit.internal.runners.JUnit4ClassRunner$1.run(JUnit4ClassRunner.java:44)
at org.junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java:27)
at org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
at org.junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:42)
at org.junit.runner.JUnitCore.run(JUnitCore.java:130)
at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.client.TableDescriptor
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 41 more
val sparkSession = SparkSession.builder
.appName("SparkTest")
.master("local[*]")
.config("spark.testing.memory", 2147480000)
.getOrCreate()
means you are running that in local and your hbase client jar is missing. (if its there in classpath then you can change the scope to runtime rather than compile)
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.1.4</version>
</dependency>
if you are using intellij to run locally, you can see hbase client jar is present in the .iml file.
normal way of runnning in cluster or client modes(not local) would be hbase claasspath add it to
export HBASE_CLASSPATH=$HBASE_CLASSPATH:`hbase classpath`
which will add all the hbase jars in to the classpath
to see/print all the jars in classpath below will be helpful to understand which jars in your classpath.
def urlsinclasspath(cl: ClassLoader): Array[java.net.URL] = cl match {
case null => Array()
case u: java.net.URLClassLoader => u.getURLs() ++ urlsinclasspath(cl.getParent)
case _ => urlsinclasspath(cl.getParent)
}
Caller would be...
val urls = urlsinclasspath(getClass.getClassLoader).foreach(println)

Get a java.lang.LinkageError: ClassCastException when use spark sql hivesql on yarn

This is the driver i upload to yarn-cluster:
package com.baidu.spark.forhivetest
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.hive._
import org.apache.spark.SparkContext
object ForTest {
def main(args : Array[String]){
val sc = new SparkContext()
val sqlc = new SQLContext(sc)
val hivec = new HiveContext(sc)
hivec.sql("CREATE TABLE IF NOT EXISTS newtest (time TIMESTAMP,word STRING,current_city_name STRING,content_src_name STRING,content_name STRING)")
val schema = hivec.table("newtest").schema
println(schema)
}
In hive config file: i set the hive.metastore.uris and hive.metastore.warehouse.dir
On spark-sumbit I do added jars
datanucleus-api-jdo-3.2.6.jar
datanucleus-core-3.2.10.jar
datanucleus-rdbms-3.2.9.jar
Even if I added the mysql-connector-java-5.1.38-bin.jar and spark-1.6.0-bin-hadoop2.6/lib/guava-14.0.1.jar , I still get this error!
But when i run this spark on ide it works successly!
Hope someone can help me ! thx a lot!
This is the error information:
java.lang.LinkageError: ClassCastException: attempting to castjar:file:/mnt/hadoop/yarn/local/filecache/18/spark-assembly-1.6.0-hadoop2.6.0.jar!/javax/ws/rs/ext/RuntimeDelegate.classtojar:file:/mnt/hadoop/yarn/local/filecache/18/spark-assembly-1.6.0-hadoop2.6.0.jar!/javax/ws/rs/ext/RuntimeDelegate.class
at javax.ws.rs.ext.RuntimeDelegate.findDelegate(RuntimeDelegate.java:116)
at javax.ws.rs.ext.RuntimeDelegate.getInstance(RuntimeDelegate.java:91)
at javax.ws.rs.core.MediaType.<clinit>(MediaType.java:44)
at com.sun.jersey.core.header.MediaTypes.<clinit>(MediaTypes.java:64)
at com.sun.jersey.core.spi.factory.MessageBodyFactory.initReaders(MessageBodyFactory.java:182)
at com.sun.jersey.core.spi.factory.MessageBodyFactory.initReaders(MessageBodyFactory.java:175)
at com.sun.jersey.core.spi.factory.MessageBodyFactory.init(MessageBodyFactory.java:162)
at com.sun.jersey.api.client.Client.init(Client.java:342)
at com.sun.jersey.api.client.Client.access$000(Client.java:118)
at com.sun.jersey.api.client.Client$1.f(Client.java:191)
at com.sun.jersey.api.client.Client$1.f(Client.java:187)
at com.sun.jersey.spi.inject.Errors.processWithErrors(Errors.java:193)
at com.sun.jersey.api.client.Client.<init>(Client.java:187)
at com.sun.jersey.api.client.Client.<init>(Client.java:170)
at org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl.serviceInit(TimelineClientImpl.java:268)
at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163)
at org.apache.hadoop.hive.ql.hooks.ATSHook.<init>(ATSHook.java:67)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at java.lang.Class.newInstance(Class.java:374)
at org.apache.hadoop.hive.ql.hooks.HookUtils.getHooks(HookUtils.java:60)
at org.apache.hadoop.hive.ql.Driver.getHooks(Driver.java:1309)
at org.apache.hadoop.hive.ql.Driver.getHooks(Driver.java:1293)
at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1347)
at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1195)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049)
at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$runHive$1.apply(ClientWrapper.scala:484)
at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$runHive$1.apply(ClientWrapper.scala:473)
at org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$withHiveState$1.apply(ClientWrapper.scala:279)
at org.apache.spark.sql.hive.client.ClientWrapper.liftedTree1$1(ClientWrapper.scala:226)
at org.apache.spark.sql.hive.client.ClientWrapper.retryLocked(ClientWrapper.scala:225)
at org.apache.spark.sql.hive.client.ClientWrapper.withHiveState(ClientWrapper.scala:268)
at org.apache.spark.sql.hive.client.ClientWrapper.runHive(ClientWrapper.scala:473)
at org.apache.spark.sql.hive.client.ClientWrapper.runSqlHive(ClientWrapper.scala:463)
at org.apache.spark.sql.hive.HiveContext.runSqlHive(HiveContext.scala:605)
at org.apache.spark.sql.hive.execution.HiveNativeCommand.run(HiveNativeCommand.scala:33)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:70)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:145)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:130)
at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:817)
at com.baidu.spark.forhivetest.ForTest$.main(ForTest.scala:12)
at com.baidu.spark.forhivetest.ForTest.main(ForTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:542)
16/03/22 17:04:32 INFO yarn.ApplicationMaster: Final app status: FAILED, exitCode: 15, (reason: User class threw exception: java.lang.LinkageError: ClassCastException: attempting to castjar:file:/mnt/hadoop/yarn/local/filecache/18/spark-assembly-1.6.0-hadoop2.6.0.jar!/javax/ws/rs/ext/RuntimeDelegate.classtojar:file:/mnt/hadoop/yarn/local/filecache/18/spark-assembly-1.6.0-hadoop2.6.0.jar!/javax/ws/rs/ext/RuntimeDelegate.class)
This has to do with your class path. Try not to build a fat jar.

Resources