Getting NullPointerException (Name is null) when querying a Glue table using Spark SQL on EMR - apache-spark

I've set up an AWS EMR cluster with Spark and Zeppelin, using the AWS Glue Data Catalog as the metastore for Hive. I followed these instructions: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-glue.html
It seems that the EMR cluster and Zeppelin are working.
When I run the following paragraph:
%sql
show databases
It works and outputs my databases set up in Glue.
However, when I try to query any of the tables, this way:
%sql
select * from `fus-bear-parquet-db-prod`.fus_bear_parquet_prod limit 10
I get the following exception:
java.lang.NullPointerException: Name is null
at java.lang.Enum.valueOf(Enum.java:236)
at org.apache.hadoop.hive.ql.metadata.Table.getTableType(Table.java:401)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1$$anonfun$apply$11.apply(HiveClientImpl.scala:394)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1$$anonfun$apply$11.apply(HiveClientImpl.scala:373)
at scala.Option.map(Option.scala:146)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1.apply(HiveClientImpl.scala:373)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1.apply(HiveClientImpl.scala:371)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:290)
at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:231)
at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:230)
at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:273)
at org.apache.spark.sql.hive.client.HiveClientImpl.getTableOption(HiveClientImpl.scala:371)
at org.apache.spark.sql.hive.client.HiveClient$class.getTable(HiveClient.scala:75)
at org.apache.spark.sql.hive.client.HiveClientImpl.getTable(HiveClientImpl.scala:79)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getRawTable$1.apply(HiveExternalCatalog.scala:118)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getRawTable$1.apply(HiveExternalCatalog.scala:118)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.getRawTable(HiveExternalCatalog.scala:117)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getTable$1.apply(HiveExternalCatalog.scala:675)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getTable$1.apply(HiveExternalCatalog.scala:675)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.getTable(HiveExternalCatalog.scala:674)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.lookupRelation(SessionCatalog.scala:667)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupTableFromCatalog(Analyzer.scala:646)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.resolveRelation(Analyzer.scala:601)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:631)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$8.applyOrElse(Analyzer.scala:624)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:62)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:62)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:61)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:59)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:624)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:570)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
at scala.collection.immutable.List.foldLeft(List.scala:84)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:69)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:67)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:50)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:632)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:691)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.zeppelin.spark.SparkSqlInterpreter.interpret(SparkSqlInterpreter.java:116)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:97)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:498)
at org.apache.zeppelin.scheduler.Job.run(Job.java:175)
at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Appreciate any help.

I will rephrase @Andrey Cheptsov's comment as an answer, since it solves the problem and will be more prominent for others facing the same issue. Thanks Andrey!
The problem is caused by the TableType property (EXTERNAL_TABLE, VIRTUAL_VIEW, etc.) not being set for the table. It is set automatically when creating tables from Athena, but can get forgotten when using CloudFormation or Terraform.
To solve the problem, make sure you add the TableType property to your table and retry the query.
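If the table already exists, you can also backfill the property in place. Below is a minimal sketch using boto3 (assuming default AWS credentials and region; the database and table names are taken from the question, and the set of fields kept for TableInput may need adjusting for newer Glue API versions):
import boto3

glue = boto3.client("glue")
database = "fus-bear-parquet-db-prod"

# Fetch the current definition, keep only fields update_table accepts
# in TableInput, set TableType, and write the definition back.
table = glue.get_table(DatabaseName=database, Name="fus_bear_parquet_prod")["Table"]

allowed_keys = {
    "Name", "Description", "Owner", "Retention", "StorageDescriptor",
    "PartitionKeys", "ViewOriginalText", "ViewExpandedText",
    "TableType", "Parameters",
}
table_input = {k: v for k, v in table.items() if k in allowed_keys}
table_input["TableType"] = "EXTERNAL_TABLE"

glue.update_table(DatabaseName=database, TableInput=table_input)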

If you are using the CDK, I was able to fix this by setting table_type:
aws_glue.CfnTable.TableInputProperty(
    description="My Description",
    name="my_table_name",
    table_type="EXTERNAL_TABLE",
    ...,
)
Using only CloudFormation:
"GlueTable": {
"Type": "AWS::Glue::Table",
"Properties": {
"CatalogId": {
"Ref": "AWS::AccountId"
},
"DatabaseName": "my_data_base_name",
"TableInput": {
"Description": "My Description",
"Name": "my_table_name",
...
"TableType": "EXTERNAL_TABLE"
}
}
}

Related

Issue while reading delta file placed under wasb storage from mapr cluster

I am trying to read a Delta-format file from Azure storage using the code below in a Jupyter notebook running on a MapR cluster.
When I run this code, it throws: java.lang.NoSuchMethodException: org.apache.hadoop.fs.azure.NativeAzureFileSystem.<init>(java.net.URI, org.apache.hadoop.conf.Configuration)
%%configure -f
{ "conf": {"spark.jars.packages": "io.delta:delta-core_2.11:0.5.0,org.apache.hadoop:hadoop-azure:3.3.1"}, "kind": "spark",
"driverMemory" : "4g", "executorMemory": "2g", "executorCores": 3, "numExecutors":4}
val storage_account_name_dim = "storageAcc"
val sas_access_key_dim = "saskey"
spark.conf.set("spark.delta.logStore.class","org.apache.spark.sql.delta.storage.AzureLogStore")
spark.conf.set("fs.AbstractFileSystem.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set(s"fs.azure.sas.dsr.$storage_account_name_dim.blob.core.windows.net", sas_access_key_dim)
val refdf = spark.read.format("delta").load("wasbs://dsr#storageAcc.blob.core.windows.net/dim/ds/DS_CUSTOM_WAG_VENDOR_UPC")
refdf.show(1)
It gives me the following error:
java.lang.RuntimeException: java.lang.NoSuchMethodException: org.apache.hadoop.fs.azure.NativeAzureFileSystem.<init>(java.net.URI, org.apache.hadoop.conf.Configuration)
at org.apache.hadoop.fs.AbstractFileSystem.newInstance(AbstractFileSystem.java:135)
at org.apache.hadoop.fs.AbstractFileSystem.createFileSystem(AbstractFileSystem.java:164)
at org.apache.hadoop.fs.AbstractFileSystem.get(AbstractFileSystem.java:249)
at org.apache.hadoop.fs.FileContext$2.run(FileContext.java:334)
at org.apache.hadoop.fs.FileContext$2.run(FileContext.java:331)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1669)
at org.apache.hadoop.fs.FileContext.getAbstractFileSystem(FileContext.java:331)
at org.apache.hadoop.fs.FileContext.getFileContext(FileContext.java:448)
at org.apache.spark.sql.delta.storage.HDFSLogStore.getFileContext(HDFSLogStore.scala:53)
at org.apache.spark.sql.delta.storage.HDFSLogStore.listFrom(HDFSLogStore.scala:137)
at org.apache.spark.sql.delta.Checkpoints$class.findLastCompleteCheckpoint(Checkpoints.scala:174)
at org.apache.spark.sql.delta.DeltaLog.findLastCompleteCheckpoint(DeltaLog.scala:58)
at org.apache.spark.sql.delta.Checkpoints$class.loadMetadataFromFile(Checkpoints.scala:156)
at org.apache.spark.sql.delta.Checkpoints$class.loadMetadataFromFile(Checkpoints.scala:150)
at org.apache.spark.sql.delta.Checkpoints$class.loadMetadataFromFile(Checkpoints.scala:150)
at org.apache.spark.sql.delta.Checkpoints$class.loadMetadataFromFile(Checkpoints.scala:150)
at org.apache.spark.sql.delta.Checkpoints$class.lastCheckpoint(Checkpoints.scala:133)
at org.apache.spark.sql.delta.DeltaLog.lastCheckpoint(DeltaLog.scala:58)
at org.apache.spark.sql.delta.DeltaLog.<init>(DeltaLog.scala:139)
at org.apache.spark.sql.delta.DeltaLog$$anon$3$$anonfun$call$1$$anonfun$apply$10.apply(DeltaLog.scala:744)
at org.apache.spark.sql.delta.DeltaLog$$anon$3$$anonfun$call$1$$anonfun$apply$10.apply(DeltaLog.scala:744)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:194)
at org.apache.spark.sql.delta.DeltaLog$$anon$3$$anonfun$call$1.apply(DeltaLog.scala:743)
at org.apache.spark.sql.delta.DeltaLog$$anon$3$$anonfun$call$1.apply(DeltaLog.scala:743)
at com.databricks.spark.util.DatabricksLogging$class.recordOperation(DatabricksLogging.scala:77)
at org.apache.spark.sql.delta.DeltaLog$.recordOperation(DeltaLog.scala:671)
at org.apache.spark.sql.delta.metering.DeltaLogging$class.recordDeltaOperation(DeltaLogging.scala:103)
at org.apache.spark.sql.delta.DeltaLog$.recordDeltaOperation(DeltaLog.scala:671)
at org.apache.spark.sql.delta.DeltaLog$$anon$3.call(DeltaLog.scala:742)
at org.apache.spark.sql.delta.DeltaLog$$anon$3.call(DeltaLog.scala:740)
at com.google.common.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4792)
at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2257)
at com.google.common.cache.LocalCache.get(LocalCache.java:4000)
at com.google.common.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
at org.apache.spark.sql.delta.DeltaLog$.apply(DeltaLog.scala:740)
at org.apache.spark.sql.delta.DeltaLog$.forTable(DeltaLog.scala:712)
at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:169)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
... 50 elided
Caused by: java.lang.NoSuchMethodException: org.apache.hadoop.fs.azure.NativeAzureFileSystem.<init>(java.net.URI, org.apache.hadoop.conf.Configuration)
at java.lang.Class.getConstructor0(Class.java:3082)
at java.lang.Class.getDeclaredConstructor(Class.java:2178)
at org.apache.hadoop.fs.AbstractFileSystem.newInstance(AbstractFileSystem.java:129)
... 95 more
The same code works for a Parquet file. Any help will be much appreciated.
I resolved it by adding spark.delta.logStore.class to the conf parameter:
{ "conf": {"spark.jars.packages": "io.delta:delta-core_2.11:0.5.0,org.apache.hadoop:hadoop-azure:3.3.1","spark.delta.logStore.class":"org.apache.spark.sql.delta.storage.AzureLogStore"}, "kind": "spark","driverMemory" : "4g", "executorMemory": "2g", "executorCores": 3, "numExecutors":4}

Spark-Sql can not access Hive table on Aliyun(Alibaba Cloud) OSS

My Hive external table is stored on Aliyun OSS. Hive can execute this SQL: select * from log_imp_v2 where dt=20201209 and hour=10 limit 10;
However, in Spark SQL the same query fails with the errors below:
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2597)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3269)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.spark.sql.hive.DetermineTableStats$$anonfun$apply$2.applyOrElse(HiveStrategies.scala:125)
at org.apache.spark.sql.hive.DetermineTableStats$$anonfun$apply$2.applyOrElse(HiveStrategies.scala:117)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:288)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.hive.DetermineTableStats.apply(HiveStrategies.scala:117)
at org.apache.spark.sql.hive.DetermineTableStats.apply(HiveStrategies.scala:116)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.ArrayBuffer.foldLeft(ArrayBuffer.scala:48)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:124)
at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:118)
at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:103)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:694)
at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:62)
at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:364)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:272)
at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:904)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2501)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2595)
... 78 more
I have tried the solution on this site: https://github.com/AlibabaCloudDocs/oss/blob/master/cn.zh-CN/%E6%9C%80%E4%BD%B3%E5%AE%9E%E8%B7%B5/%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86%E4%B8%8E%E5%88%86%E6%9E%90/HDP2.6%20Hadoop%E5%A6%82%E4%BD%95%E8%AF%BB%E5%86%99OSS.md; nothing changed.
I have also tried this: https://hadoop.apache.org/docs/r2.9.2/hadoop-aliyun/tools/hadoop-aliyun/index.html; nothing changed.
By the way, I use Ambari to install Hadoop 3.1.1.
versions:
Spark2 2.3.2
HDFS 3.1.1
YARN 3.1.1
Hive 3.1.0

How to authenticate with BigQuery from Apache Spark (pyspark)?

I have created a client ID and client secret for my BigQuery project, but I don't know how to use them to save a dataframe from a PySpark script to my BigQuery table. My Python code below results in the following error. Is there a way I can connect to BigQuery using the save options on the PySpark dataframe?
Code
df.write \
    .format("bigquery") \
    .option("client_id", "<MY_CLIENT_ID>") \
    .option("client_secret", "<MY_CLIENT_SECRET>") \
    .option("project", "bigquery-project-id") \
    .option("table", "dataset.table") \
    .save()
Error
py4j.protocol.Py4JJavaError: An error occurred while calling o93.save.
: com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryException: 400 Bad Request { "error": "invalid_grant", "error_description": "Bad Request" }
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.translate(HttpBigQueryRpc.java:106)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.getTable(HttpBigQueryRpc.java:268)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$17.call(BigQueryImpl.java:664)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$17.call(BigQueryImpl.java:661)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.retrying.DirectRetryingExecutor.submit(DirectRetryingExecutor.java:105)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.RetryHelper.run(RetryHelper.java:76)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.RetryHelper.runWithRetries(RetryHelper.java:50)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl.getTable(BigQueryImpl.java:660)
at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.getTable(BigQueryInsertableRelation.scala:68)
at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.exists(BigQueryInsertableRelation.scala:54)
at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:86)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpResponseException: 400 Bad Request { "error": "invalid_grant", "error_description": "Bad Request" }
From the spark-bigquery-connector documentation:
How do I authenticate outside GCE / Dataproc?
Use a service account JSON key and GOOGLE_APPLICATION_CREDENTIALS as described here.
Credentials can also be provided explicitly either as a parameter or from Spark runtime configuration. It can be passed in as a base64-encoded string directly, or a file path that contains the credentials (but not both).
So you should be using this:
spark.read.format("bigquery").option("credentialsFile", "</path/to/key/file>")

Provider org.apache.spark.sql.hive.orc.DefaultSource could not be instantiated

I have a simple Spark job that reads data from Hive and some from DB2, does some calculations, and puts the results in DB2. On the line of code where I try to read data from DB2, I see the following error:
Exception in thread "main" java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.hive.orc.DefaultSource could not be instantiated
at java.util.ServiceLoader.fail(ServiceLoader.java:232)
at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:533)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:89)
at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:89)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:304)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at com.ibm.sifs.trade.framework.persistence.TradePersistence.persistTradeSummary(TradePersistence.java:697)
at com.ibm.sifs.trade.summaries.IntraDaySummary.main(IntraDaySummary.java:136)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:782)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.VerifyError: Bad return type
Exception Details:
Location:
org/apache/spark/sql/hive/orc/DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;[Ljava/lang/String;Lscala/Option;Lscala/Option;Lscala/collection/immutable/Map;)Lorg/apache/spark/sql/sources/HadoopFsRelation; #35: areturn
Reason:
Type 'org/apache/spark/sql/hive/orc/OrcRelation' (current frame, stack[0]) is not assignable to 'org/apache/spark/sql/sources/HadoopFsRelation' (from method signature)
Current Frame:
bci: #35
flags: { }
locals: { 'org/apache/spark/sql/hive/orc/DefaultSource', 'org/apache/spark/sql/SQLContext', '[Ljava/lang/String;', 'scala/Option', 'scala/Option', 'scala/collection/immutable/Map' }
stack: { 'org/apache/spark/sql/hive/orc/OrcRelation' }
Bytecode:
0x0000000: b200 1c2b c100 1ebb 000e 592a b700 22b6
0x0000010: 0026 bb00 2859 2c2d b200 2d19 0419 052b
0x0000020: b700 30b0
at java.lang.Class.getDeclaredConstructors0(Native Method)
at java.lang.Class.privateGetDeclaredConstructors(Class.java:2671)
at java.lang.Class.getConstructor0(Class.java:3075)
at java.lang.Class.newInstance(Class.java:412)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
... 27 more
These are the jars in my spark-submit command:
--files /etc/spark2/2.6.4.0-91/0/hive-site.xml,/etc/spark2/2.6.4.0-91/0/hbase-site.xml \
--jars /usr/hdp/current/spark2-client/jars/spark-yarn_2.11-2.2.0.2.6.4.0-91.jar,/usr/hdp/2.6.4.0-91/hbase/lib/hbase-server.jar,/usr/hdp/2.6.4.0-91/hbase/lib/hbase-protocol.jar,/usr/hdp/2.6.4.0-91/hbase/lib/hbase-client.jar,/usr/hdp/2.6.4.0-91/hbase/lib/hbase-common.jar \
--driver-java-options "-Dlog4j.configuration=file:/etc/spark2/2.6.4.0-91/0/log4j.properties" \
--conf "spark.driver.extraClassPath=/home/sifsuser/lib/*:/usr/hdp/2.6.4.0-91/hive/lib/*:/usr/hdp/current/spark2-client/jars/*:/usr/hdp/2.6.4.0-91/hbase/lib/*"
This is due to a Spark version mismatch. Check that you have the same version of Spark locally as in your project bundle.
I ran into this issue when I had Spark 2.2 on my Mac and 2.4 in my Maven project. When I matched the versions, everything worked fine.

NullPointerException when running Apache Spark

I am trying to run a query over Redshift to extract data into a dataframe. The same query works on Spark 2.0.2, but since Databricks deprecated that old version, I moved to Spark 2.2.1, and I am getting the following exception in the new environment.
Any help is appreciated.
In short, the NullPointerException is coming from:
java.lang.NullPointerException at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter.write(UnsafeRowWriter.java:210)
I also tried disabling whole-stage code generation with sparkConf.set("spark.sql.codegen.wholeStage", "false"), but it still does not work.
Does anyone know how to fix this?
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1683)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1671)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1670)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1670)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:931)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:931)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:931)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1903)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1854)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1842)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:733)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:231)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:241)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:64)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:70)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:45)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2484)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3037)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2453)
at org.apache.spark.sql.Dataset$$anonfun$collect$1.apply(Dataset.scala:2453)
at org.apache.spark.sql.Dataset$$anonfun$59.apply(Dataset.scala:3021)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:89)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:127)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3020)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2453)
at com.axs.dataplatform.redshift.merge.RedshiftMerger.merge(RedshiftMerger.scala:30)
at com.axs.dataplatform.flashseats.segmentation.operations.Merge$.doMerge(Merge.scala:36)
at com.axs.dataplatform.flashseats.segmentation.FlashseatsSegmentation$$anonfun$2$$anonfun$apply$1$$anonfun$apply$2.apply(FlashseatsSegmentation.scala:99)
at com.axs.dataplatform.flashseats.segmentation.FlashseatsSegmentation$$anonfun$2$$anonfun$apply$1$$anonfun$apply$2.apply(FlashseatsSegmentation.scala:99)
at scala.collection.immutable.List.foreach(List.scala:381)
at com.axs.dataplatform.flashseats.segmentation.FlashseatsSegmentation$$anonfun$2$$anonfun$apply$1.apply(FlashseatsSegmentation.scala:99)
at com.axs.dataplatform.flashseats.segmentation.FlashseatsSegmentation$$anonfun$2$$anonfun$apply$1.apply(FlashseatsSegmentation.scala:97)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.parallel.ParIterableLike$Foreach.leaf(ParIterableLike.scala:972)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply$mcV$sp(Tasks.scala:49)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$$anonfun$tryLeaf$1.apply(Tasks.scala:48)
at scala.collection.parallel.Task$class.tryLeaf(Tasks.scala:51)
at scala.collection.parallel.ParIterableLike$Foreach.tryLeaf(ParIterableLike.scala:969)
at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask$class.compute(Tasks.scala:152)
at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:443)
at scala.concurrent.forkjoin.RecursiveAction.exec(RecursiveAction.java:160)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by a java.lang.NullPointerException:
at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter.write(UnsafeRowWriter.java:210)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:423)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:423)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:349)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
When I set the spark.sql.codegen.wholeStage to false, I get another NullPointerException:
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificSafeProjection.apply(Unknown Source)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:462)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$9.apply(HashAggregateExec.scala:132)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec$$anonfun$doExecute$1$$anonfun$9.apply(HashAggregateExec.scala:130)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:855)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1$$anonfun$apply$26.apply(RDD.scala:855)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:332)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:296)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:332)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:296)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
Yes, I did, did you encounter the same problem?
Here's the solution:
def setNullableStateForAllColumns(df: DataFrame, nullable: Boolean) = {
  // Rebuild the schema with every column forced to the requested nullability
  val schema = df.schema
  StructType(schema.map {
    case StructField(c, t, _, m) ⇒ StructField(c, t, nullable = nullable, m)
  })
}

def extractNullableData(sql: String): DataFrame = {
  logger.info(s"Extracting data from ${source.conf} with sql:\n$sql")
  val tempS3Dir = "s3n://data-platform-temp/tmp/redshift_extract"
  // First read only to discover the schema reported by Redshift
  val origDf =
    context
      .read
      .format("com.databricks.spark.redshift")
      .option("forward_spark_s3_credentials", true)
      .option("url", source.jdbcUrlWPass)
      .option("jdbcdriver", source.driver)
      .option("autoenablessl", "false")
      .option("tempdir", tempS3Dir)
      .option("query", sql)
      .load()
  // Second read with the same schema, but with every column marked nullable,
  // so generated code does not assume non-null values for columns that
  // actually contain nulls
  context.read
    .format("com.databricks.spark.redshift")
    .option("forward_spark_s3_credentials", true)
    .option("url", source.jdbcUrlWPass)
    .option("jdbcdriver", source.driver)
    .option("autoenablessl", "false")
    .schema(setNullableStateForAllColumns(origDf, true))
    .option("tempdir", tempS3Dir)
    .option("query", sql)
    .load()
}
