Partitioned delta table failing to write checkpoints? - apache-spark

I'm using pyspark with spark 3.2.1 and delta table package ("io.delta:delta-core_2.12:2.1.0").
I have partitioned my table by Code, Year and Month like below:
table.write.partitionBy("Code", "Year", "Month").format('delta').save(path)
When I run a merge on this table:
deltaTable.alias("t0").merge(
df.alias("t1"),
" t0.Code= t1.CodeAND "
" t0.Year = t1.Year AND "
" t0.Month = t1.Month AND "
" t0.Date = t1.Date"
).whenMatchedUpdate(
set={
...
}
).execute()
I got the following warning:
WARN DAGScheduler: Broadcasting large task binary with size 1861.8 KiB
Then I got the following error after some minutes:
py4j.protocol.Py4JJavaError: An error occurred while calling o2070.execute.
: java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.expressions.ElementAt$.apply$default$3()Lscala/Option;
at org.apache.spark.sql.delta.CheckpointV2$.$anonfun$extractPartitionValues$1(Checkpoints.scala:727)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at org.apache.spark.sql.types.StructType.foreach(StructType.scala:102)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at org.apache.spark.sql.types.StructType.map(StructType.scala:102)
at org.apache.spark.sql.delta.CheckpointV2$.extractPartitionValues(Checkpoints.scala:724)
at org.apache.spark.sql.delta.Checkpoints$.buildCheckpoint(Checkpoints.scala:689)
at org.apache.spark.sql.delta.Checkpoints$.$anonfun$writeCheckpoint$1(Checkpoints.scala:531)
at org.apache.spark.sql.delta.metering.DeltaLogging.withDmqTag(DeltaLogging.scala:143)
at org.apache.spark.sql.delta.metering.DeltaLogging.withDmqTag$(DeltaLogging.scala:142)
at org.apache.spark.sql.delta.Checkpoints$.withDmqTag(Checkpoints.scala:460)
at org.apache.spark.sql.delta.Checkpoints$.writeCheckpoint(Checkpoints.scala:487)
at org.apache.spark.sql.delta.Checkpoints.writeCheckpointFiles(Checkpoints.scala:361)
at org.apache.spark.sql.delta.Checkpoints.writeCheckpointFiles$(Checkpoints.scala:359)
at org.apache.spark.sql.delta.DeltaLog.writeCheckpointFiles(DeltaLog.scala:63)
at org.apache.spark.sql.delta.Checkpoints.checkpointAndCleanUpDeltaLog(Checkpoints.scala:346)
at org.apache.spark.sql.delta.Checkpoints.checkpointAndCleanUpDeltaLog$(Checkpoints.scala:344)
at org.apache.spark.sql.delta.DeltaLog.checkpointAndCleanUpDeltaLog(DeltaLog.scala:63)
at org.apache.spark.sql.delta.Checkpoints.$anonfun$checkpoint$2(Checkpoints.scala:318)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:139)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:137)
at org.apache.spark.sql.delta.DeltaLog.recordFrameProfile(DeltaLog.scala:63)
at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:132)
at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
at org.apache.spark.sql.delta.DeltaLog.recordOperation(DeltaLog.scala:63)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:131)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:121)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:109)
at org.apache.spark.sql.delta.DeltaLog.recordDeltaOperation(DeltaLog.scala:63)
at org.apache.spark.sql.delta.Checkpoints.$anonfun$checkpoint$1(Checkpoints.scala:314)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.delta.metering.DeltaLogging.withDmqTag(DeltaLogging.scala:143)
at org.apache.spark.sql.delta.metering.DeltaLogging.withDmqTag$(DeltaLogging.scala:142)
at org.apache.spark.sql.delta.DeltaLog.withDmqTag(DeltaLog.scala:63)
at org.apache.spark.sql.delta.Checkpoints.checkpoint(Checkpoints.scala:313)
at org.apache.spark.sql.delta.Checkpoints.checkpoint$(Checkpoints.scala:312)
at org.apache.spark.sql.delta.DeltaLog.checkpoint(DeltaLog.scala:63)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.postCommit(OptimisticTransaction.scala:1097)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.postCommit$(OptimisticTransaction.scala:1092)
at org.apache.spark.sql.delta.OptimisticTransaction.postCommit(OptimisticTransaction.scala:101)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.liftedTree1$1(OptimisticTransaction.scala:750)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.$anonfun$commit$1(OptimisticTransaction.scala:691)
at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:139)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:137)
at org.apache.spark.sql.delta.OptimisticTransaction.recordFrameProfile(OptimisticTransaction.scala:101)
at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:132)
at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
at org.apache.spark.sql.delta.OptimisticTransaction.recordOperation(OptimisticTransaction.scala:101)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:131)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:121)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:109)
at org.apache.spark.sql.delta.OptimisticTransaction.recordDeltaOperation(OptimisticTransaction.scala:101)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.commit(OptimisticTransaction.scala:688)
at org.apache.spark.sql.delta.OptimisticTransactionImpl.commit$(OptimisticTransaction.scala:686)
at org.apache.spark.sql.delta.OptimisticTransaction.commit(OptimisticTransaction.scala:101)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:363)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:319)
at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:221)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:319)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:139)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:137)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordFrameProfile(MergeIntoCommand.scala:215)
at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:132)
at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordOperation(MergeIntoCommand.scala:215)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:131)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:121)
at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:109)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordDeltaOperation(MergeIntoCommand.scala:215)
at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:317)
at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:230)
at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:104)
at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:90)
at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:122)
at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:206)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
This error only happens when the table is partitioned! If I not use partitions on my folder my code runs normally.
I'm using this code inside a for loop, performing multiple updates on that table!
So my doubt is: Why is this happening when the table is partitioned?

You're using incompatible version of the Delta Lake - as you can see in the docs, the version 2.1.0 is compatible with Spark 3.3.0, so you can't use it with 3.2.1. You need to take 2.0.1 instead

Related

Special characters in Dataproc partition columns

I'm using Spark 3.1.2 on Goole Dataproc image version 2.0.15-debian10 with Dataproc managed metastore version 3.1.2. The following snippet works fine with a GCS backed table mydb.mytable:
from pyspark.sql import Row
spark.createDataFrame([Row(x='a', y=1)]).write.saveAsTable('mydb.mytable',
mode='overwrite',
partitionBy=['x'])
However, when adding a special character to partition column:
spark.createDataFrame([Row(x='ก', y=1)]).write.saveAsTable('mydb.mytable',
mode='overwrite',
partitionBy=['x'])
the operation fails with the following exception:
21/08/03 10:44:27 ERROR hive.ql.metadata.Hive: MetaException(message:Exception thrown when executing query : SELECT DISTINCT 'org.apache.hadoop.hive.metastore.model.MPartition' AS `NUCLEUS_TYPE`,`A0`.`CREATE_TIME`,`A0`.`LAST_ACCESS_TIME`,`A0`.`PART_NAME`,`A0`.`PART_ID` FROM `PARTITIONS` `A0` LEFT OUTER JOIN `TBLS` `B0` ON `A0`.`TBL_ID` = `B0`.`TBL_ID` LEFT OUTER JOIN `DBS` `C0` ON `B0`.`DB_ID` = `C0`.`DB_ID` WHERE `B0`.`TBL_NAME` = ? AND `C0`.`NAME` = ? AND `A0`.`PART_NAME` = ? AND `C0`.`CTLG_NAME` = ?)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result$add_partitions_req_resultStandardScheme.read(ThriftHiveMetastore.java)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result$add_partitions_req_resultStandardScheme.read(ThriftHiveMetastore.java)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$add_partitions_req_result.read(ThriftHiveMetastore.java)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:88)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_add_partitions_req(ThriftHiveMetastore.java:1911)
at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.add_partitions_req(ThriftHiveMetastore.java:1898)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.add_partitions(HiveMetaStoreClient.java:627)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:173)
at com.sun.proxy.$Proxy46.add_partitions(Unknown Source)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient$SynchronizedHandler.invoke(HiveMetaStoreClient.java:2336)
at com.sun.proxy.$Proxy46.add_partitions(Unknown Source)
at org.apache.hadoop.hive.ql.metadata.Hive.createPartitions(Hive.java:2097)
at org.apache.spark.sql.hive.client.Shim_v0_13.createPartitions(HiveShim.scala:555)
at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$createPartitions$1(HiveClientImpl.scala:609)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:291)
at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:224)
at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:223)
at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:273)
at org.apache.spark.sql.hive.client.HiveClientImpl.createPartitions(HiveClientImpl.scala:602)
at org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$createPartitions$1(HiveExternalCatalog.scala:1007)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:102)
at org.apache.spark.sql.hive.HiveExternalCatalog.createPartitions(HiveExternalCatalog.scala:989)
at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createPartitions(ExternalCatalogWithListener.scala:201)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createPartitions(SessionCatalog.scala:1050)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.$anonfun$addPartitions$1(ddl.scala:792)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.$anonfun$addPartitions$1$adapted(ddl.scala:774)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.addPartitions(ddl.scala:774)
at org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand.run(ddl.scala:672)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:192)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
at org.apache.spark.sql.DataFrameWriter.createTable(DataFrameWriter.scala:753)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:727)
at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:626)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
The data gets written correctly to GCS but updating the metastore fails. My guess is this is caused by using latin1 encoding in the managed metastore. Also, using spark.write.parquet(...) works (as the metastore is not used).
Is it possible to configure Dataproc to properly handle any UTF-8 values in partition columns? I would like to avoid using any encoding (like URL encoding) in the application logic.
The issue is that Hive Metastore's MySQL schema does not support that character:
PART_NAME varchar(767) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
https://github.com/apache/hive/blob/master/metastore/scripts/upgrade/mysql/hive-schema-2.3.0.mysql.sql#L211
You could reach out to the Hive community to check whether a larger character set is OK: https://github.com/apache/hive#useful-mailing-lists

py4j.protocol.Py4JJavaError: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem not found

I am trying to read the csv file from pyspark, while reading it is throwing
py4j.protocol.Py4JJavaError: An error
occurred while calling o30.csv. : java.lang.RuntimeException:
java.lang.ClassNotFoundException: Class
org.apache.hadoop.fs.azure.NativeAzureFileSystem not found at
org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2595)
at
org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3269)
at
org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at
org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479) at
org.apache.hadoop.fs.Path.getFileSystem(Path.java:361) at
org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
at
org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:376)
at
org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:326)
at
org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:308)
at scala.Option.getOrElse(Option.scala:189) at
org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:308)
at
org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:796)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at
py4j.Gateway.invoke(Gateway.java:282) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:238) at
java.lang.Thread.run(Thread.java:748) Caused by:
java.lang.ClassNotFoundException: Class
org.apache.hadoop.fs.azure.NativeAzureFileSystem not found at
org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2499)
at
org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2593)
... 25 more
Process finished with exit code 1
Error, So could anyone please suggest me where am i doing wrong in below code.
from pyspark.sql import SparkSession
SECRET_ACCESS_KEY = "XXXXXXXXXXX"
STORAGE_NAME = "azuresvkstorageaccount11123"
CONTAINER = "inputstorage1"
FILE_NAME = "movies.csv"
spark = SparkSession.builder.appName("Azure_PySpark_Connectivity")\
.master("local[*]")\
.getOrCreate()
fs_acc_key = "fs.azure.account.key." + STORAGE_NAME + ".blob.core.windows.net"
spark.conf.set("spark.hadoop.fs.wasb.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set(fs_acc_key, SECRET_ACCESS_KEY)
file_path = "wasb://inputstorage1#azuresvkstorageaccount.blob.core.windows.net/movies.csv"
print(file_path)
Df = spark.read.csv(path=file_path,header=True,inferSchema=True) #Error Coming from this line it is unable to read the csv file
#Df.show(20,True)
I have figured out the problem, it is coming from maven jars, the solution is
Download hadoop-azure and azure-storage jars from the maven portal manually and copy these jars to spark/jars/. folder.
Do the same for the jetty-utils jar, add this jar to spark/jars/. folder
Then refresh and run the script again, it works perfectly.

Using Spark JDBC and Avatica to read records from a table in Apache Druid

I am trying to create a Dataframe in Spark that would contain all records from a table in Apache Druid and I am doing this using JDBC. Druid seems to be using the Calcite-Avatica JDBC driver (mentioned here).
df = spark.read.format('jdbc').option('url', 'jdbc:avatica:remote:url=http://172.31.5.20:8082/druid/v2/sql/avatica/').option('driver', 'org.apache.calcite.avatica.remote.Driver').option('dbtable', 'mytable').load()
But I am getting the following error
Py4JJavaError: An error occurred while calling o456.load.
: java.sql.SQLException: While closing connection
at org.apache.calcite.avatica.Helper.createException(Helper.java:39)
at org.apache.calcite.avatica.AvaticaConnection.close(AvaticaConnection.java:156)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:70)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:115)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:341)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException: Unrecognized field "rpcMetadata" (class org.apache.calcite.avatica.remote.Service$CloseConnectionResponse), not marked as ignorable (0 known properties: ])
at [Source: (String)"{"response":"closeConnection","rpcMetadata":{"response":"rpcMetadata","serverAddress":"ip-172-31-5-234.ap-southeast-1.compute.internal:8082"}}
"; line: 1, column: 46] (through reference chain: org.apache.calcite.avatica.remote.Service$CloseConnectionResponse["rpcMetadata"])
at org.apache.calcite.avatica.remote.JsonService.handle(JsonService.java:142)
at org.apache.calcite.avatica.remote.JsonService.apply(JsonService.java:229)
at org.apache.calcite.avatica.remote.RemoteMeta.closeConnection(RemoteMeta.java:78)
at org.apache.calcite.avatica.AvaticaConnection.close(AvaticaConnection.java:153)
... 18 more
Caused by: com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException: Unrecognized field "rpcMetadata" (class org.apache.calcite.avatica.remote.Service$CloseConnectionResponse), not marked as ignorable (0 known properties: ])
at [Source: (String)"{"response":"closeConnection","rpcMetadata":{"response":"rpcMetadata","serverAddress":"ip-172-31-5-234.ap-southeast-1.compute.internal:8082"}}
"; line: 1, column: 46] (through reference chain: org.apache.calcite.avatica.remote.Service$CloseConnectionResponse["rpcMetadata"])
at com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException.from(UnrecognizedPropertyException.java:61)
at com.fasterxml.jackson.databind.DeserializationContext.handleUnknownProperty(DeserializationContext.java:823)
at com.fasterxml.jackson.databind.deser.std.StdDeserializer.handleUnknownProperty(StdDeserializer.java:1153)
at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.handleUnknownProperty(BeanDeserializerBase.java:1589)
at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.handleUnknownVanilla(BeanDeserializerBase.java:1567)
at com.fasterxml.jackson.databind.deser.BeanDeserializer.vanillaDeserialize(BeanDeserializer.java:294)
at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeOther(BeanDeserializer.java:189)
at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserialize(BeanDeserializer.java:161)
at com.fasterxml.jackson.databind.jsontype.impl.AsPropertyTypeDeserializer._deserializeTypedForId(AsPropertyTypeDeserializer.java:130)
at com.fasterxml.jackson.databind.jsontype.impl.AsPropertyTypeDeserializer.deserializeTypedFromObject(AsPropertyTypeDeserializer.java:97)
at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.deserializeWithType(BeanDeserializerBase.java:1178)
at com.fasterxml.jackson.databind.deser.impl.TypeWrappedDeserializer.deserialize(TypeWrappedDeserializer.java:68)
at com.fasterxml.jackson.databind.ObjectMapper._readMapAndClose(ObjectMapper.java:4014)
at com.fasterxml.jackson.databind.ObjectMapper.readValue(ObjectMapper.java:3005)
at org.apache.calcite.avatica.remote.JsonService.decode(JsonService.java:131)
at org.apache.calcite.avatica.remote.JsonService.apply(JsonService.java:227)
... 20 more
Does anyone know what could be the cause for this and how do I fix this? It seems that this seems to be an issue with the Avatica driver where it gets a json object with an unrecognizable field
I am using the driver org.apache.calcite.avatica:avatica-core:1.17.0 and have added the jar file to my spark.jars property. I am using Druid 0.19.0 and Spark2.
Edit: I checked the source code for the Avatica JDBC framework and the constructor annotated as #JsonCreator expects a property called rpcMetadata in the json object being deserialized. Source code is here.
Use below library & try again.
"org.apache.calcite.avatica" % "avatica" % "1.8.0"

query apache drill form within apache spark

I am querying Apache drill from within apache spark. My question is, how to send sql commands other than select * from from spark to drill. By default, spark is sending the queries inside select * from. Also, when I am querying schema other than dfs, I am getting NullPointerException. Please help!
My spark version is 2.2.0
Here are my codes:
1. schema = dfs:
dataframe_mysql = spark.read.format("jdbc").option("url", "jdbc:drill:zk=%s;schema=%s;" % (foreman,schema)).option("driver","org.apache.drill.jdbc.Driver").option("dbtable","\"/user/titanic_data/test.csv\"").load()
Schema = MySQL
dataframe_mysql = spark.read.format("jdbc").option("url", "jdbc:drill:zk=%s;schema=MySQL;" % (foreman)).option("driver","org.apache.drill.jdbc.Driver").option("dbtable","MySQL.\"spark3\"").load()
This is the complete error:
Name: org.apache.toree.interpreter.broker.BrokerException
Message: Py4JJavaError: An error occurred while calling o40.load.
: java.sql.SQLException: Failed to create prepared statement: SYSTEM ERROR: NullPointerException
[Error Id: d1e4b310-f4df-4e7c-90ae-983cc5c89f94 on inpunpclx1825e.kih.kmart.com:31010]
at org.apache.drill.jdbc.impl.DrillJdbc41Factory.newServerPreparedStatement(DrillJdbc41Factory.java:147)
at org.apache.drill.jdbc.impl.DrillJdbc41Factory.newPreparedStatement(DrillJdbc41Factory.java:108)
at org.apache.drill.jdbc.impl.DrillJdbc41Factory.newPreparedStatement(DrillJdbc41Factory.java:50)
at oadd.org.apache.calcite.avatica.AvaticaConnection.prepareStatement(AvaticaConnection.java:278)
at org.apache.drill.jdbc.impl.DrillConnectionImpl.prepareStatement(DrillConnectionImpl.java:389)
at oadd.org.apache.calcite.avatica.AvaticaConnection.prepareStatement(AvaticaConnection.java:119)
at org.apache.drill.jdbc.impl.DrillConnectionImpl.prepareStatement(DrillConnectionImpl.java:422)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:60)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:113)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:47)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:306)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError('An error occurred while calling o40.load.\n', JavaObject id=o41), <traceback object at 0x7f00106d6488>)
StackTrace: org.apache.toree.interpreter.broker.BrokerState$$anonfun$markFailure$1.apply(BrokerState.scala:163)
org.apache.toree.interpreter.broker.BrokerState$$anonfun$markFailure$1.apply(BrokerState.scala:163)
scala.Option.foreach(Option.scala:257)
org.apache.toree.interpreter.broker.BrokerState.markFailure(BrokerState.scala:162)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:498)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:280)
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
py4j.commands.CallCommand.execute(CallCommand.java:79)
py4j.GatewayConnection.run(GatewayConnection.java:214)
java.lang.Thread.run(Thread.java:748)
I have changed the default drill quote from `` to "" so that there won't be any quoting identifier issue between spark and drill.

pyspark phoenix NullPointerException

trying to load data to spark from hbase 1.1 using phoenix-spark 4.9.0 on spark 2.1 but failing on following error:
>>> df = spark.read.format('org.apache.phoenix.spark').option('table', 'namespace.table').option('zkUrl', '10.0.1.1:2181').load()
: java.sql.SQLException: java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2432)
at org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2352)
at org.apache.phoenix.util.PhoenixContextExecutor.call(PhoenixContextExecutor.java:76)
at org.apache.phoenix.query.ConnectionQueryServicesImpl.init(ConnectionQueryServicesImpl.java:2352)
at org.apache.phoenix.jdbc.PhoenixDriver.getConnectionQueryServices(PhoenixDriver.java:232)
at org.apache.phoenix.jdbc.PhoenixEmbeddedDriver.createConnection(PhoenixEmbeddedDriver.java:147)
at org.apache.phoenix.jdbc.PhoenixDriver.connect(PhoenixDriver.java:202)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getConnection(ConnectionUtil.java:98)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getInputConnection(ConnectionUtil.java:57)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getInputConnection(ConnectionUtil.java:45)
at org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.getSelectColumnMetadataList(PhoenixConfigurationUtil.java:279)
at org.apache.phoenix.spark.PhoenixRDD.toDataFrame(PhoenixRDD.scala:114)
at org.apache.phoenix.spark.PhoenixRelation.schema(PhoenixRelation.scala:60)
at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:40)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:389)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
any clues anyone?

Resources