Special characters in Dataproc partition columns - apache-spark

I'm using Spark 3.1.2 on Goole Dataproc image version 2.0.15-debian10 with Dataproc managed metastore version 3.1.2. The following snippet works fine with a GCS backed table mydb.mytable:
from pyspark.sql import Row
spark.createDataFrame([Row(x='a', y=1)]).write.saveAsTable('mydb.mytable',
mode='overwrite',
partitionBy=['x'])
However, when adding a special character to partition column:
spark.createDataFrame([Row(x='ก', y=1)]).write.saveAsTable('mydb.mytable',
mode='overwrite',
partitionBy=['x'])
the operation fails with the following exception:
21/08/03 10:44:27 ERROR hive.ql.metadata.Hive: MetaException(message:Exception thrown when executing query : SELECT DISTINCT 'org.apache.hadoop.hive.metastore.model.MPartition' AS `NUCLEUS_TYPE`,`A0`.`CREATE_TIME`,`A0`.`LAST_ACCESS_TIME`,`A0`.`PART_NAME`,`A0`.`PART_ID` FROM `PARTITIONS` `A0` LEFT OUTER JOIN `TBLS` `B0` ON `A0`.`TBL_ID` = `B0`.`TBL_ID` LEFT OUTER JOIN `DBS` `C0` ON `B0`.`DB_ID` = `C0`.`DB_ID` WHERE `B0`.`TBL_NAME` = ? AND `C0`.`NAME` = ? AND `A0`.`PART_NAME` = ? AND `C0`.`CTLG_NAME` = ?)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result$add_partitions_req_resultStandardScheme.read(ThriftHiveMetastore.java)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result$add_partitions_req_resultStandardScheme.read(ThriftHiveMetastore.java)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result.read(ThriftHiveMetastore.java)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:88)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_add_partitions_req(ThriftHiveMetastore.java:1911)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.add_partitions_req(ThriftHiveMetastore.java:1898)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.add_partitions(HiveMetaStoreClient.java:627)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:173)
at com.sun.proxy.$Proxy46.add_partitions(Unknown Source)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient$SynchronizedHandler.invoke(HiveMetaStoreClient.java:2336)
at com.sun.proxy.$Proxy46.add_partitions(Unknown Source)
at org.apache.hadoop.hive.ql.metadata.Hive.createPartitions(Hive.java:2097)
at org.apache.spark.sql.hive.client.Shim_v0_13.createPartitions(HiveShim.scala:555)
at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$createPartitions$1(HiveClientImpl.scala:609)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:291)
at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:224)
at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:223)
at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:273)
at org.apache.spark.sql.hive.client.HiveClientImpl.createPartitions(HiveClientImpl.scala:602)
at org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$createPartitions$1(HiveExternalCatalog.scala:1007)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:102)
at org.apache.spark.sql.hive.HiveExternalCatalog.createPartitions(HiveExternalCatalog.scala:989)
at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createPartitions(ExternalCatalogWithListener.scala:201)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createPartitions(SessionCatalog.scala:1050)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.$anonfun$addPartitions$1(ddl.scala:792)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.$anonfun$addPartitions$1$adapted(ddl.scala:774)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.addPartitions(ddl.scala:774)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.run(ddl.scala:672)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:192)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
at org.apache.spark.sql.DataFrameWriter.createTable(DataFrameWriter.scala:753)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:727)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:626)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
The data gets written correctly to GCS but updating the metastore fails. My guess is this is caused by using latin1 encoding in the managed metastore. Also, using spark.write.parquet(...) works (as the metastore is not used).
Is it possible to configure Dataproc to properly handle any UTF-8 values in partition columns? I would like to avoid using any encoding (like URL encoding) in the application logic.

The issue is that Hive Metastore's MySQL schema does not support that character:
PART_NAME varchar(767) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
https://github.com/apache/hive/blob/master/metastore/scripts/upgrade/mysql/hive-schema-2.3.0.mysql.sql#L211
You could reach out to the Hive community to check whether a larger character set is OK: https://github.com/apache/hive#useful-mailing-lists

Related

couldn't save table in azure databricks

During saving dataframe to tables in Azure Databricks I get error,
val employeesDf = Seq(
("Rafferty", Some(31)), ("Jones", Some(33)), ("Heisenberg", Some(33)),
("Robinson", Some(34)), ("Smith", Some(34)), ("Williams", null)
).toDF("LastName","DepartmentID").write.format("parquet").mode("overwrite").saveAsTable("employ ees_table")
org.apache.spark.sql.AnalysisException:
org.apache.hadoop.hive.ql.metadata.HiveException:
MetaException(message:javax.jdo.JDOUserException: Table
"partition_keys" has been specified with a primary-key to include
column "TBL_ID" but this column is not found in the table. Please
check your column specification. <div
class="ansiout"> at
org.datanucleus.api.jdo.NucleusJDOHelper.getJDOExceptionForNucleusException(NucleusJDOHelper.java:549)
at
org.datanucleus.api.jdo.JDOPersistenceManager.jdoMakePersistent(JDOPersistenceManager.java:732)
at
org.datanucleus.api.jdo.JDOPersistenceManager.makePersistent(JDOPersistenceManager.java:752)
at
org.apache.hadoop.hive.metastore.ObjectStore.createTable(ObjectStore.java:719)
at sun.reflect.GeneratedMethodAccessor441.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RawStoreProxy.invoke(RawStoreProxy.java:108)
at com.sun.proxy.$Proxy32.createTable(Unknown Source) at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_core(HiveMetaStore.java:1261)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_with_environment_context(HiveMetaStore.java:1294)
at sun.reflect.GeneratedMethodAccessor439.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
at
com.sun.proxy.$Proxy33.create_table_with_environment_context(Unknown
Source) ...
Additionally, I am also getting an error during run an example notebook from databricks, in which create tables from path on dbfs
%sql
DROP TABLE IF EXISTS diamonds;
CREATE TABLE diamonds
USING csv
OPTIONS (path "/databricks-datasets/Rdatasets/data-
001/csv/ggplot2/diamonds.csv", header "true")
Error in SQL statement: AnalysisException:
org.apache.hadoop.hive.ql.metadata.HiveException:
MetaException(message:javax.jdo.JDOUserException: Table
"partition_keys" has been specified with a primary-key to include
column "TBL_ID" but this column is not found in the table. Please
check your column specification. at
org.datanucleus.api.jdo.NucleusJDOHelper.getJDOExceptionForNucleusException(NucleusJDOHelper.java:549)
at
org.datanucleus.api.jdo.JDOPersistenceManager.jdoMakePersistent(JDOPersistenceManager.java:732)
at
org.datanucleus.api.jdo.JDOPersistenceManager.makePersistent(JDOPersistenceManager.java:752)
at
org.apache.hadoop.hive.metastore.ObjectStore.createTable(ObjectStore.java:719)
at sun.reflect.GeneratedMethodAccessor441.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RawStoreProxy.invoke(RawStoreProxy.java:108)
at com.sun.proxy.$Proxy32.createTable(Unknown Source) at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_core(HiveMetaStore.java:1261)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_with_environment_context(HiveMetaStore.java:1294)
at sun.reflect.GeneratedMethodAccessor439.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
at
com.sun.proxy.$Proxy33.create_table_with_environment_context(Unknown
Source) at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createTable(HiveMetaStoreClient.java:558)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createTable(HiveMetaStoreClient.java:547)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89)
at com.sun.proxy.$Proxy34.createTable(Unknown Source) at
org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:613) at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply$mcV$sp(HiveClientImpl.scala:528)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:322)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$retryLocked$1.apply(HiveClientImpl.scala:230)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$retryLocked$1.apply(HiveClientImpl.scala:222)
at
org.apache.spark.sql.hive.client.HiveClientImpl.synchronizeOnObject(HiveClientImpl.scala:266)
at
org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:222)
at
org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:305)
at
org.apache.spark.sql.hive.client.HiveClientImpl.createTable(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.PoolingHiveClient$$anonfun$createTable$1.apply(PoolingHiveClient.scala:286)
at
org.apache.spark.sql.hive.client.PoolingHiveClient$$anonfun$createTable$1.apply(PoolingHiveClient.scala:285)
at
org.apache.spark.sql.hive.client.PoolingHiveClient.withHiveClient(PoolingHiveClient.scala:112)
at
org.apache.spark.sql.hive.client.PoolingHiveClient.createTable(PoolingHiveClient.scala:285)
at
org.apache.spark.sql.hive.HiveExternalCatalog.saveTableIntoHive(HiveExternalCatalog.scala:554)
at
org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$createDataSourceTable(HiveExternalCatalog.scala:461)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply$mcV$sp(HiveExternalCatalog.scala:325)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$withClient$1$$anonfun$apply$1.apply(HiveExternalCatalog.scala:141)
at
org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$maybeSynchronized(HiveExternalCatalog.scala:104)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$withClient$1.apply(HiveExternalCatalog.scala:139)
at
com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:345)
at
com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:331)
at
com.databricks.spark.util.SparkDatabricksProgressReporter$.withStatusCode(ProgressReporter.scala:34)
at
org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:137)
at
org.apache.spark.sql.hive.HiveExternalCatalog.createTable(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createTable(ExternalCatalogWithListener.scala:99)
at
org.apache.spark.sql.catalyst.catalog.SessionCatalog.createTable(SessionCatalog.scala:349)
at
com.databricks.sql.DatabricksSessionCatalog.createTable(DatabricksSessionCatalog.scala:144)
at
org.apache.spark.sql.execution.command.CreateDataSourceTableCommand.run(createDataSourceTables.scala:118)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:72)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:70)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:81)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:205)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:205)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3424)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3419)
at
org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:99)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:228)
at
org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:85)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:158)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3419)
at org.apache.spark.sql.Dataset.(Dataset.scala:205) at
org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:89) at
org.apache.spark.sql.SparkSession.sql(SparkSession.scala:696) at
org.apache.spark.sql.SQLContext.sql(SQLContext.scala:707) at
com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:87)
at
com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:33)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:392) at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:296) at
com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:33)
at
com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:136)
at
com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$8.apply(DriverLocal.scala:323)
at
com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$8.apply(DriverLocal.scala:303)
at
com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:235)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58) at
com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:230)
at
com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:47)
at
com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:268)
at
com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:47)
at
com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:303)
at
com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:591)
at
com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:591)
at scala.util.Try$.apply(Try.scala:192) at
com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:586)
at
com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:477)
at
com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:544)
at
com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:383)
at
com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:330)
at
com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:216)
at java.lang.Thread.run(Thread.java:748) NestedThrowablesStackTrace:
Table "partition_keys" has been specified with a primary-key to
include column "TBL_ID" but this column is not found in the table.
Please check your column specification.
org.datanucleus.exceptions.NucleusUserException: Table
"partition_keys" has been specified with a primary-key to include
column "TBL_ID" but this column is not found in the table. Please
check your column specification. at
org.datanucleus.store.rdbms.table.ElementContainerTable.applyUserPrimaryKeySpecification(ElementContainerTable.java:217)
at
org.datanucleus.store.rdbms.table.CollectionTable.initialize(CollectionTable.java:240)
at
org.datanucleus.store.rdbms.RDBMSStoreManager$ClassAdder.initializeClassTables(RDBMSStoreManager.java:3283)
at
org.datanucleus.store.rdbms.RDBMSStoreManager$ClassAdder.addClassTablesAndValidate(RDBMSStoreManager.java:3185)
at
org.datanucleus.store.rdbms.RDBMSStoreManager$ClassAdder.run(RDBMSStoreManager.java:2841)
at
org.datanucleus.store.rdbms.AbstractSchemaTransaction.execute(AbstractSchemaTransaction.java:122)
at
org.datanucleus.store.rdbms.RDBMSStoreManager.addClasses(RDBMSStoreManager.java:1605)
at
org.datanucleus.store.AbstractStoreManager.addClass(AbstractStoreManager.java:954)
at
org.datanucleus.store.rdbms.RDBMSStoreManager.getDatastoreClass(RDBMSStoreManager.java:679)
at
org.datanucleus.store.rdbms.RDBMSStoreManager.getPropertiesForGenerator(RDBMSStoreManager.java:2045)
at
org.datanucleus.store.AbstractStoreManager.getStrategyValue(AbstractStoreManager.java:1365)
at
org.datanucleus.ExecutionContextImpl.newObjectId(ExecutionContextImpl.java:3827)
at
org.datanucleus.state.JDOStateManager.setIdentity(JDOStateManager.java:2571)
at
org.datanucleus.state.JDOStateManager.initialiseForPersistentNew(JDOStateManager.java:513)
at
org.datanucleus.state.ObjectProviderFactoryImpl.newForPersistentNew(ObjectProviderFactoryImpl.java:232)
at
org.datanucleus.ExecutionContextImpl.newObjectProviderForPersistentNew(ExecutionContextImpl.java:1414)
at
org.datanucleus.ExecutionContextImpl.persistObjectInternal(ExecutionContextImpl.java:2218)
at
org.datanucleus.ExecutionContextImpl.persistObjectWork(ExecutionContextImpl.java:2065)
at
org.datanucleus.ExecutionContextImpl.persistObject(ExecutionContextImpl.java:1913)
at
org.datanucleus.ExecutionContextThreadedImpl.persistObject(ExecutionContextThreadedImpl.java:217)
at
org.datanucleus.api.jdo.JDOPersistenceManager.jdoMakePersistent(JDOPersistenceManager.java:727)
at
org.datanucleus.api.jdo.JDOPersistenceManager.makePersistent(JDOPersistenceManager.java:752)
at
org.apache.hadoop.hive.metastore.ObjectStore.createTable(ObjectStore.java:719)
at sun.reflect.GeneratedMethodAccessor441.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RawStoreProxy.invoke(RawStoreProxy.java:108)
at com.sun.proxy.$Proxy32.createTable(Unknown Source) at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_core(HiveMetaStore.java:1261)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_table_with_environment_context(HiveMetaStore.java:1294)
at sun.reflect.GeneratedMethodAccessor439.invoke(Unknown Source) at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:105)
at
com.sun.proxy.$Proxy33.create_table_with_environment_context(Unknown
Source) at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createTable(HiveMetaStoreClient.java:558)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createTable(HiveMetaStoreClient.java:547)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:89)
at com.sun.proxy.$Proxy34.createTable(Unknown Source) at
org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:613) at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply$mcV$sp(HiveClientImpl.scala:528)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createTable$1.apply(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:322)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$retryLocked$1.apply(HiveClientImpl.scala:230)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$retryLocked$1.apply(HiveClientImpl.scala:222)
at
org.apache.spark.sql.hive.client.HiveClientImpl.synchronizeOnObject(HiveClientImpl.scala:266)
at
org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:222)
at
org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:305)
at
org.apache.spark.sql.hive.client.HiveClientImpl.createTable(HiveClientImpl.scala:526)
at
org.apache.spark.sql.hive.client.PoolingHiveClient$$anonfun$createTable$1.apply(PoolingHiveClient.scala:286)
at
org.apache.spark.sql.hive.client.PoolingHiveClient$$anonfun$createTable$1.apply(PoolingHiveClient.scala:285)
at
org.apache.spark.sql.hive.client.PoolingHiveClient.withHiveClient(PoolingHiveClient.scala:112)
at
org.apache.spark.sql.hive.client.PoolingHiveClient.createTable(PoolingHiveClient.scala:285)
at
org.apache.spark.sql.hive.HiveExternalCatalog.saveTableIntoHive(HiveExternalCatalog.scala:554)
at
org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$createDataSourceTable(HiveExternalCatalog.scala:461)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply$mcV$sp(HiveExternalCatalog.scala:325)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$withClient$1$$anonfun$apply$1.apply(HiveExternalCatalog.scala:141)
at
org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$maybeSynchronized(HiveExternalCatalog.scala:104)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$withClient$1.apply(HiveExternalCatalog.scala:139)
at
com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:345)
at
com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:331)
at
com.databricks.spark.util.SparkDatabricksProgressReporter$.withStatusCode(ProgressReporter.scala:34)
at
org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:137)
at
org.apache.spark.sql.hive.HiveExternalCatalog.createTable(HiveExternalCatalog.scala:298)
at
org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createTable(ExternalCatalogWithListener.scala:99)
at
org.apache.spark.sql.catalyst.catalog.SessionCatalog.createTable(SessionCatalog.scala:349)
at
com.databricks.sql.DatabricksSessionCatalog.createTable(DatabricksSessionCatalog.scala:144)
at
org.apache.spark.sql.execution.command.CreateDataSourceTableCommand.run(createDataSourceTables.scala:118)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:72)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:70)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:81)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:205)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:205)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3424)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:3419)
at
org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:99)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:228)
at
org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:85)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:158)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3419)
at org.apache.spark.sql.Dataset.(Dataset.scala:205) at
org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:89) at
org.apache.spark.sql.SparkSession.sql(SparkSession.scala:696) at
org.apache.spark.sql.SQLContext.sql(SQLContext.scala:707) at
com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:87)
at
com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:33)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:392) at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:296) at
com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:33)
at
com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:136)
at
com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$8.apply(DriverLocal.scala:323)
at
com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$8.apply(DriverLocal.scala:303)
at
com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:235)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58) at
com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:230)
at
com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:47)
at
com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:268)
at
com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:47)
at
com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:303)
at
com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:591)
at
com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$tryExecutingCommand$2.apply(DriverWrapper.scala:591)
at scala.util.Try$.apply(Try.scala:192) at
com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:586)
at
com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:477)
at
com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:544)
at
com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:383)
at
com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:330)
at
com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:216)
at java.lang.Thread.run(Thread.java:748) );
I'm able to run the same query without any issue.
This issue looks strange. For a deeper investigation and immediate assistance on this issue, if you have a support plan you may file a support ticket.
For more details, refer "Azure Databricks Quickstart guide".
This error is due to incorrect configuration of the external metastore

How to fix 'org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)'

What I did (Structured Streaming)
> 1.) ./bin/pyspark
> 2.) spark
> 3.) static = spark.read.json("/data/activity-data/")
> 4.) dataSchema = static.schema
> 5.) streaming = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1)\
> 6.) .json("/data/activity-data")
> 7.) activityCounts = streaming.groupBy("gt").count()
Then I got this huge error.
Could you help me how to troubleshoot this?
ERRORs:
org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:110)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$3.applyOrElse(CheckAnalysis.scala:107)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:278)
at
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:277)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:93)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:105)
at
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at
org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:104)
at
org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:116)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$2.apply(QueryPlan.scala:121)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at
org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:121)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:126)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at
org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:126)
at
org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:93)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:107)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:85)
at
org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:85)
at
org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:95)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:108)
at
org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105)
at
org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201)
at
org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)
at
org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at
org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at
org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78) at
org.apache.spark.sql.RelationalGroupedDataset.toDF(RelationalGroupedDataset.scala:65)
at
org.apache.spark.sql.RelationalGroupedDataset.count(RelationalGroupedDataset.scala:237)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at
py4j.Gateway.invoke(Gateway.java:282) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:238) at
java.lang.Thread.run(Thread.java:748)
If you have duplicate keys, you will get this error.
You can also refer to https://issues.apache.org/jira/browse/SPARK-10925

Zeppelin - Spark Interpreter Unable to create hive table by using CTAS (Create Table as Select ...) statement

I am using Zeppelin and trying to create a hive table from another hive table by using CTAS statement
But my query ends up with error always so the table is not getting created. Have found out few posts which says to modify zeppelin configuration but I cannot change any configuration as I don't have permission to do so.
The query which I had executed and the error that I get are given below :
%sql
create table student as select * from student_score
org.apache.hadoop.hive.ql.metadata.HiveException: Unable to alter
table. Invalid method name: 'alter_table_with_cascade' at
org.apache.hadoop.hive.ql.metadata.Hive.alterTable(Hive.java:500) at
org.apache.hadoop.hive.ql.metadata.Hive.alterTable(Hive.java:484) at
org.apache.hadoop.hive.ql.metadata.Hive.loadTable(Hive.java:1668) at
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.spark.sql.hive.client.Shim_v0_14.loadTable(HiveShim.scala:716)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$loadTable$1.apply$mcV$sp(HiveClientImpl.scala:672)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$loadTable$1.apply(HiveClientImpl.scala:672)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$loadTable$1.apply(HiveClientImpl.scala:672)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:283)
at
org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:230)
at
org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:229)
at
org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:272)
at
org.apache.spark.sql.hive.client.HiveClientImpl.loadTable(HiveClientImpl.scala:671)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$loadTable$1.apply$mcV$sp(HiveExternalCatalog.scala:741)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$loadTable$1.apply(HiveExternalCatalog.scala:739)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$loadTable$1.apply(HiveExternalCatalog.scala:739)
at
org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:95)
at
org.apache.spark.sql.hive.HiveExternalCatalog.loadTable(HiveExternalCatalog.scala:739)
at
org.apache.spark.sql.hive.execution.InsertIntoHiveTable.sideEffectResult$lzycompute(InsertIntoHiveTable.scala:323)
at
org.apache.spark.sql.hive.execution.InsertIntoHiveTable.sideEffectResult(InsertIntoHiveTable.scala:170)
at
org.apache.spark.sql.hive.execution.InsertIntoHiveTable.doExecute(InsertIntoHiveTable.scala:347)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
at
org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
at
org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
at
org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
at
org.apache.spark.sql.hive.execution.CreateHiveTableAsSelectCommand.run(CreateHiveTableAsSelectCommand.scala:92)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at
org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
at
org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
at
org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
at
org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
at org.apache.spark.sql.Dataset.(Dataset.scala:185) at
org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64) at
org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592) ... 47
elided Caused by: org.apache.thrift.TApplicationException: Invalid
method name: 'alter_table_with_cascade' at
org.apache.thrift.TApplicationException.read(TApplicationException.java:111)
at
org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:71)
at
org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_alter_table_with_cascade(ThriftHiveMetastore.java:1374)
at
org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.alter_table_with_cascade(ThriftHiveMetastore.java:1358)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.alter_table(HiveMetaStoreClient.java:340)
at
org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.alter_table(SessionHiveMetaStoreClient.java:251)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:156)
at com.sun.proxy.$Proxy25.alter_table(Unknown Source) at
org.apache.hadoop.hive.ql.metadata.Hive.alterTable(Hive.java:496)
... 93 more

Spark AVRO S3 read not working for partitioned data

When I read a specific file it works:
val filePath= "s3n://bucket_name/f1/f2/avro/dt=2016-10-19/hr=19/000000"
val df = spark.read.avro(filePath)
But if I point to a folder to read date partitioned data it fails:
val filePath="s3n://bucket_name/f1/f2/avro/dt=2016-10-19/"
I get this error:
Exception in thread "main" org.apache.hadoop.fs.s3.S3Exception: org.jets3t.service.S3ServiceException: S3 HEAD request failed for '/f1%2Ff2%2Favro%2Fdt%3D2016-10-19' - ResponseCode=403, ResponseMessage=Forbidden
at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.handleServiceException(Jets3tNativeFileSystemStore.java:245)
at org.apache.hadoop.fs.s3native.Jets3tNativeFileSystemStore.retrieveMetadata(Jets3tNativeFileSystemStore.java:119)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
at org.apache.hadoop.fs.s3native.$Proxy7.retrieveMetadata(Unknown Source)
at org.apache.hadoop.fs.s3native.NativeS3FileSystem.getFileStatus(NativeS3FileSystem.java:414)
at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1397)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:374)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:364)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:364)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:132)
at com.databricks.spark.avro.package$AvroDataFrameReader$$anonfun$avro$2.apply(package.scala:34)
at com.databricks.spark.avro.package$AvroDataFrameReader$$anonfun$avro$2.apply(package.scala:34)
at BasicS3Avro$.main(BasicS3Avro.scala:55)
at BasicS3Avro.main(BasicS3Avro.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Am I missing anything?
what does the newer, maintained, s3a client report?

Error when writing a repartitioned SchemaRDD to Parquet with Spark SQL

I am trying to write save Spark SQL tables to Parquet files. Because of other issues I need to reduce the number of partitions before writing. My code is
data.coalesce(1000,shuffle=true).saveAsParquetFile("s3n://...")
This throws
java.lang.NullPointerException
at org.apache.spark.SparkContext.getPreferredLocs(SparkContext.scala:927)
at org.apache.spark.rdd.PartitionCoalescer.currPrefLocs(CoalescedRDD.scala:174)
at org.apache.spark.rdd.PartitionCoalescer$LocationIterator$$anonfun$4$$anonfun$apply$2.apply(CoalescedRDD.scala:191)
at org.apache.spark.rdd.PartitionCoalescer$LocationIterator$$anonfun$4$$anonfun$apply$2.apply(CoalescedRDD.scala:190)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:350)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:350)
at org.apache.spark.rdd.PartitionCoalescer$LocationIterator.<init>(CoalescedRDD.scala:185)
at org.apache.spark.rdd.PartitionCoalescer.setupGroups(CoalescedRDD.scala:236)
at org.apache.spark.rdd.PartitionCoalescer.run(CoalescedRDD.scala:337)
at org.apache.spark.rdd.CoalescedRDD.getPartitions(CoalescedRDD.scala:83)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
at org.apache.spark.rdd.MappedRDD.getPartitions(MappedRDD.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1128)
at org.apache.spark.sql.parquet.InsertIntoParquetTable.saveAsHadoopFile(ParquetTableOperations.scala:318)
at org.apache.spark.sql.parquet.InsertIntoParquetTable.execute(ParquetTableOperations.scala:246)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:409)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:409)
at org.apache.spark.sql.SchemaRDDLike$class.saveAsParquetFile(SchemaRDDLike.scala:77)
at org.apache.spark.sql.SchemaRDD.saveAsParquetFile(SchemaRDD.scala:103)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:22)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:27)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:29)
at $iwC$$iwC$$iwC.<init>(<console>:31)
at $iwC$$iwC.<init>(<console>:33)
at $iwC.<init>(<console>:35)
at <init>(<console>:37)
at .<init>(<console>:41)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:789)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1062)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:615)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:646)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:610)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:859)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:771)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:616)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:624)
at org.apache.spark.repl.SparkILoop.loop(SparkILoop.scala:629)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply$mcZ$sp(SparkILoop.scala:954)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:902)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:997)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:328)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
The code works fine if I take out the coalesce step and changing the code to use shuffle=true or using repartition throws the same error. I am using spark-1.1.0.

Resources