Why does Spark report "java.net.URISyntaxException: Relative path in absolute URI" when working with DataFrames? - apache-spark

I am running Spark locally on a Windows machine. I was able to launch the Spark shell successfully and also to read in text files as RDDs. I was also able to follow along with the various online tutorials on this subject and perform various operations on the RDDs.
However, when I try to convert an RDD into a DataFrame, I get an error. This is what I am doing:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
//convert rdd to df
val df = rddFile.toDF()
This code generates a long series of error messages that seem to relate to the following one:
Caused by: java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: file:C:/Users/spark/spark-warehouse
at org.apache.hadoop.fs.Path.initialize(Path.java:205)
at org.apache.hadoop.fs.Path.<init>(Path.java:171)
at org.apache.hadoop.hive.metastore.Warehouse.getWhRoot(Warehouse.java:159)
at org.apache.hadoop.hive.metastore.Warehouse.getDefaultDatabasePath(Warehouse.java:177)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB_core(HiveMetaStore.java:600)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:620)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
... 85 more
Caused by: java.net.URISyntaxException: Relative path in absolute URI: file:C:/Users/spark/spark-warehouse
at java.net.URI.checkPath(URI.java:1823)
at java.net.URI.<init>(URI.java:745)
at org.apache.hadoop.fs.Path.initialize(Path.java:202)
... 96 more
The entire stack trace follows.
16/08/16 12:36:20 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
16/08/16 12:36:20 WARN Hive: Failed to access metastore. This class should not accessed in runtime.
org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.ql.metadata.Hive.getAllDatabases(Hive.java:1236)
at org.apache.hadoop.hive.ql.metadata.Hive.reloadFunctions(Hive.java:174)
at org.apache.hadoop.hive.ql.metadata.Hive.<clinit>(Hive.java:166)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:171)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:401)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:342)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:24)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:29)
at $line14.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:31)
at $line14.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:33)
at $line14.$read$$iw$$iw$$iw$$iw.<init>(<console>:35)
at $line14.$read$$iw$$iw$$iw.<init>(<console>:37)
at $line14.$read$$iw$$iw.<init>(<console>:39)
at $line14.$read$$iw.<init>(<console>:41)
at $line14.$read.<init>(<console>:43)
at $line14.$read$.<init>(<console>:47)
at $line14.$read$.<clinit>(<console>)
at $line14.$eval$.$print$lzycompute(<console>:7)
at $line14.$eval$.$print(<console>:6)
at $line14.$eval.$print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at scala.tools.nsc.interpreter.ILoop.interpretStartingWith(ILoop.scala:807)
at scala.tools.nsc.interpreter.ILoop.command(ILoop.scala:681)
at scala.tools.nsc.interpreter.ILoop.processLine(ILoop.scala:395)
at scala.tools.nsc.interpreter.ILoop.loop(ILoop.scala:415)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply$mcZ$sp(ILoop.scala:923)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at scala.reflect.internal.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:97)
at scala.tools.nsc.interpreter.ILoop.process(ILoop.scala:909)
at org.apache.spark.repl.Main$.doMain(Main.scala:68)
at org.apache.spark.repl.Main$.main(Main.scala:51)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:729)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1523)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.metadata.Hive.getAllDatabases(Hive.java:1234)
... 74 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
... 80 more
Caused by: java.lang.IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: file:C:/Users/spark/spark-warehouse
at org.apache.hadoop.fs.Path.initialize(Path.java:205)
at org.apache.hadoop.fs.Path.<init>(Path.java:171)
at org.apache.hadoop.hive.metastore.Warehouse.getWhRoot(Warehouse.java:159)
at org.apache.hadoop.hive.metastore.Warehouse.getDefaultDatabasePath(Warehouse.java:177)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB_core(HiveMetaStore.java:600)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:620)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
... 85 more
Caused by: java.net.URISyntaxException: Relative path in absolute URI: file:C:/Users/spark/spark-warehouse
at java.net.URI.checkPath(URI.java:1823)
at java.net.URI.<init>(URI.java:745)
at org.apache.hadoop.fs.Path.initialize(Path.java:202)
... 96 more

This is the SPARK-15565 issue in Spark 2.0 on Windows. It has a simple workaround, and the fix appears to already be part of Spark's codebase and may be released as 2.0.2 or 2.1.0.
The solution in Spark 2.0.0 is to set spark.sql.warehouse.dir to a properly-referenced directory, say file:///c:/Spark/spark-2.0.0-bin-hadoop2.7/spark-warehouse, using /// (triple slashes).
Start spark-shell with the --conf argument as follows:
spark-shell --conf spark.sql.warehouse.dir=file:///c:/tmp/spark-warehouse
Or create a SparkSession in your Spark application using the new fluent builder pattern as follows:
import org.apache.spark.sql.SparkSession
val spark = SparkSession
  .builder()
  .config("spark.sql.warehouse.dir", "file:///c:/tmp/spark-warehouse")
  .getOrCreate()
Or create conf/spark-defaults.conf with the following content:
spark.sql.warehouse.dir file:///c:/tmp/spark-warehouse

I also got this error. In my case I was writing a file with the ":" character in its name, e.g. /tmp/dataset_10:23:11. Spark then gets confused because it thinks the ":" is part of a path scheme (like C:\).
The solution is simply to remove the ":" character from the file name.
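For example, here is a minimal sketch (the path and timestamp format are illustrative, not taken from the original code) that builds a timestamped output path without ":":
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

// e.g. /tmp/dataset_20160816_102311 -- no ":" for Hadoop's Path/URI parsing to trip over
val stamp = LocalDateTime.now.format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"))
val outputPath = s"/tmp/dataset_$stamp"
// df.write.parquet(outputPath)  // assuming an existing DataFrame called df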

If you do want to fix it in code without touching existing code, you can also pass it through a system property, so that the Spark initialization that comes after it does not need to change.
System.setProperty(
"spark.sql.warehouse.dir",
s"file:///${System.getProperty("user.dir")}/spark-warehouse"
.replaceAll("\\\\", "/")
)
Note also that this uses the current working directory, which can be replaced with "c:/tmp/" or any other place you would like the spark-warehouse directory to be.

Just set config("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse").
There is no need to include "C:/".
The Spark driver will automatically create the folder under the /tmp/spark-warehouse directory. On Windows it will end up under "C:/". Hard-coding the "C:" drive will not work if the master is not local on Windows.

I set the spark.sql.warehouse.dir property to fix the error in my existing code:
System.setProperty("spark.sql.warehouse.dir", "file:///C:/spark-warehouse");
Here is the full code snippet:
System.setProperty("hadoop.home.dir", "c:/winutil/");
System.setProperty("spark.sql.warehouse.dir", "file:///C:/spark-warehouse");
val conf = new SparkConf().setAppName("test").setMaster("local[*]")
val sc = new SparkContext(conf)
val lines = sc.textFile("C:/user.txt")

Related

py4j.protocol.Py4JJavaError: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem not found

I am trying to read a CSV file from PySpark, and while reading it throws the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o30.csv.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2595)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3269)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3301)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:124)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:376)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:326)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:308)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:308)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:796)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.azure.NativeAzureFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2499)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2593)
... 25 more
Process finished with exit code 1
Could anyone please suggest where I am going wrong in the code below?
from pyspark.sql import SparkSession
SECRET_ACCESS_KEY = "XXXXXXXXXXX"
STORAGE_NAME = "azuresvkstorageaccount11123"
CONTAINER = "inputstorage1"
FILE_NAME = "movies.csv"
spark = SparkSession.builder.appName("Azure_PySpark_Connectivity")\
.master("local[*]")\
.getOrCreate()
fs_acc_key = "fs.azure.account.key." + STORAGE_NAME + ".blob.core.windows.net"
spark.conf.set("spark.hadoop.fs.wasb.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set(fs_acc_key, SECRET_ACCESS_KEY)
file_path = "wasb://inputstorage1#azuresvkstorageaccount.blob.core.windows.net/movies.csv"
print(file_path)
Df = spark.read.csv(path=file_path,header=True,inferSchema=True) #Error Coming from this line it is unable to read the csv file
#Df.show(20,True)
I have figured out the problem; it was coming from the Maven jars. The solution is:
Download the hadoop-azure and azure-storage jars manually from the Maven portal and copy these jars into the spark/jars/ folder.
Do the same for the jetty-utils jar: add it to the spark/jars/ folder as well.
Then refresh and run the script again; it works perfectly.
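As an alternative to copying the jars by hand, the same dependencies can be pulled in at session start through Spark's spark.jars.packages setting. A minimal sketch (shown in Scala; the Maven coordinates and versions are assumptions you would match to your own Hadoop build, and the same config key applies from PySpark):
import org.apache.spark.sql.SparkSession

// Sketch only: ask Spark to resolve the Azure connector jars from Maven at startup
// instead of placing them in spark/jars/ manually. Versions below are placeholders.
val spark = SparkSession.builder()
  .appName("Azure_Connectivity")
  .master("local[*]")
  .config("spark.jars.packages",
    "org.apache.hadoop:hadoop-azure:2.7.3,com.microsoft.azure:azure-storage:5.4.0")
  .getOrCreate()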

scala> val sqlContext = new HiveContext(sc) java.lang.NoClassDefFoundError: org/apache/hadoop/hive/conf/HiveVariableSource

I am trying to access a Hive table from Spark.
I am currently using CDH 5.4; the Hive version is 1.1 and Spark is 1.6.
I have already copied the hive-site.xml file to the Spark conf folder.
When I try to run the code below, I get the error.
I am trying to get a HiveContext so that I can access Hive tables in the Spark shell.
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
val sparkConf = new SparkConf().setAppName("WordCount")
val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
I get the error on the line below:
scala> val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
java.lang.NoClassDefFoundError: org/apache/hadoop/hive/conf/HiveVariableSource
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:38)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:40)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:42)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:44)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:46)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:48)
at $iwC$$iwC$$iwC.<init>(<console>:50)
at $iwC$$iwC.<init>(<console>:52)
at $iwC.<init>(<console>:54)
at <init>(<console>:56)
at .<init>(<console>:60)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:35)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:730)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.conf.HiveVariableSource
Update based on a reply about HADOOP_CLASSPATH:
I am using CDH 5.4, and it looks like HADOOP_CLASSPATH is not set:
[cloudera@quickstart ~]$ echo "Classpath=$HADOOP_CLASSPATH"
Classpath=
But the classpath command below gave me some output. Do I need to set HADOOP_CLASSPATH, or is it fine as is?
[cloudera@quickstart ~]$ hadoop classpath
/etc/hadoop/conf:/usr/lib/hadoop/lib/*:/usr/lib/hadoop/.//*:/usr/lib/hadoop-hdfs/./:/usr/lib/hadoop-hdfs/lib/*:/usr/lib/hadoop-hdfs/.//*:/usr/lib/hadoop-yarn/lib/*:/usr/lib/hadoop-yarn/.//*:/usr/lib/hadoop-mapreduce/lib/*:/usr/lib/hadoop-mapreduce/.//*
Try this:
val sc = SparkContext.getOrCreate(sparkConf)
Then use your piece of code to create the HiveContext.
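For reference, a minimal end-to-end sketch of that suggestion (assuming, as in the question, that hive-site.xml is in the Spark conf folder and the Hive classes are on the classpath):
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

val sparkConf = new SparkConf().setAppName("WordCount")
// reuse the shell's existing SparkContext instead of constructing a second one
val sc = SparkContext.getOrCreate(sparkConf)
val sqlContext = new HiveContext(sc)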

PySpark restart SparkContext on failure

I need to compute some aggregations for each table in a Hive database. My code is something like:
sc = SparkContext()
sqlContext = HiveContext(sc)
showtables_df = sqlContext.sql('show tables in my_db')
for onlinetable in showtables_df.select('tableName').rdd.collect():
    hive_table_name = onlinetable['tableName']
    try:
        table_df = sqlContext.sql('SELECT * FROM my_db.' + hive_table_name)
        table_df.count()  # sample action
    except Exception as e:
        logger.error('Error in table ' + hive_table_name)
        logger.error(type(e))
        logger.error(e)
At a certain point, a huge table read causes an exception to be raised, and the SparkContext closes. From this point on, every call to the sqlContext fails:
[2018-05-20 07:40:36] ERROR - my_table: An error occurred while calling o40.sql.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:59)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
py4j.Gateway.invoke(Gateway.java:214)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
py4j.GatewayConnection.run(GatewayConnection.java:209)
java.lang.Thread.run(Thread.java:745)
The currently active SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:59)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
py4j.Gateway.invoke(Gateway.java:214)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
py4j.GatewayConnection.run(GatewayConnection.java:209)
java.lang.Thread.run(Thread.java:745)
at org.apache.spark.SparkContext.org$apache$spark$SparkContext$$assertNotStopped(SparkContext.scala:106)
at org.apache.spark.SparkContext$$anonfun$parallelize$1.apply(SparkContext.scala:729)
at org.apache.spark.SparkContext$$anonfun$parallelize$1.apply(SparkContext.scala:728)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:714)
at org.apache.spark.SparkContext.parallelize(SparkContext.scala:728)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:70)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:145)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:130)
at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:817)
at sun.reflect.GeneratedMethodAccessor23.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
As of now, I'm not able to add resources to my job, but I'd like to restart the SparkContext and go on computing aggregations on the remaining tables of the for loop.
Is it a good (and feasible) idea to check in the exception handler whether the SparkContext has stopped, and if so recreate it with:
...
    except Exception as e:
        logger.error('Error in table ' + hive_table_name)
        logger.error(type(e))
        logger.error(e)
        if sc._jsc.sc().isStopped():
            sc = SparkContext()
            sqlContext = HiveContext(sc)
I'm working with Spark version 1.6.1, running jobs with --master yarn --deploy-mode client

SparkSQL key not found: scale

Spark version is 1.6.0.
I am trying to run a simple SQL query against a remote Oracle 11g DB using Spark SQL.
Of course, the ojdbc driver has been added to the classpath, and a ping to the DB is also OK.
SparkConf conf = new SparkConf().setAppName(APP_NAME).setMaster("yarn-client");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
Map<String, String> connectionProperties = new HashMap<>();
connectionProperties.put("user", username);
connectionProperties.put("password", password);
connectionProperties.put("url", url);
connectionProperties.put("dbtable", "(SELECT * FROM tableName)");
connectionProperties.put("driver", "oracle.jdbc.OracleDriver");
DataFrame result = sqlContext.read().format("jdbc").options(connectionProperties).load();
The error arises at the last line, in the .load() method.
The resulting stack trace is:
Exception in thread "main" java.util.NoSuchElementException: key not found: scale
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.MapLike$class.apply(MapLike.scala:141)
at scala.collection.AbstractMap.apply(Map.scala:58)
at org.apache.spark.sql.types.Metadata.get(Metadata.scala:108)
at org.apache.spark.sql.types.Metadata.getLong(Metadata.scala:51)
at org.apache.spark.sql.jdbc.OracleDialect$.getCatalystType(OracleDialect.scala:33)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:140)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:91)
at org.apache.spark.sql.execution.datasources.jdbc.DefaultSource.createRelation(DefaultSource.scala:57)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
at myapp.dfComparator.entity.OriginalSourceTable.load(OriginalSourceTable.java:74)
at myapp.dfComparator.Program.main(Program.java:74)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I have no idea what is wrong.
EDIT 01/27/2017
Additional information:
Hadoop version is 2.6.0-cdh.5.8.3
Spark version is 1.6.0 with Scala version 2.10.5
I tried to reproduce the code above in Scala and execute it using spark-shell:
val jdbcDF = sqlContext.read.format("jdbc").options(
Map("url" -> "jdbc:oracle:thin:system/system#db-host:1521:orcl",
"dbtable"-> "schema_name.table_name",
"driver"-> "oracle.jdbc.OracleDriver",
"username" -> "user",
"password" -> "pwd")).load()
The result of this code is a similar stack trace:
java.util.NoSuchElementException: key not found: scale
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:58)
at scala.collection.MapLike$class.apply(MapLike.scala:141)
at scala.collection.AbstractMap.apply(Map.scala:58)
at org.apache.spark.sql.types.Metadata.get(Metadata.scala:108)
at org.apache.spark.sql.types.Metadata.getLong(Metadata.scala:51)
at org.apache.spark.sql.jdbc.OracleDialect$.getCatalystType(OracleDialect.scala:33)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:140)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:91)
at org.apache.spark.sql.execution.datasources.jdbc.DefaultSource.createRelation(DefaultSource.scala:57)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:25)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:30)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:32)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:34)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:36)
at $iwC$$iwC$$iwC.<init>(<console>:38)
at $iwC$$iwC.<init>(<console>:40)
at $iwC.<init>(<console>:42)
at <init>(<console>:44)
at .<init>(<console>:48)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
So I have a strong suspicion that there is some mistake in the configuration of Spark and/or Hadoop.
EDIT 02/01/2017
I have found that this problem arises only when the column type in the Oracle table is NUMBER. For example, if I cast the id column (whose type is NUMBER) to VARCHAR in the SELECT statement, then everything works fine:
val jdbcDF = sqlContext.read.format("jdbc").options(
Map("url" -> "jdbc:oracle:thin:system/system#db-host:1521:orcl",
"dbtable"-> "(SELECT CAST(id AS varchar(3))) FROM tableName",
"driver"-> "oracle.jdbc.OracleDriver",
"username" -> "user",
"password" -> "pwd")).load()
In more detail, the stack trace shows that the problem appears in the org.apache.spark.sql.types.Metadata.get method. Investigating this method in the sources, we can see (or assume) that in the case of the NUMBER type it tries to read the value as a Long and cannot find its scale.
That's why I now think the main issue is in the Cloudera distribution of Apache Spark.
In the meantime I received an answer from Cloudera: in CDH 5.8.3 with Spark 1.6 this is a known issue, which is resolved in Spark version 2.0.
To work around the problem we have three options:
1. In the SELECT query, CAST any numeric type to VARCHAR; instead of CAST we can also use the TO_CHAR function (see the sketch after this list).
2. Load the data using an RDD/JavaRDD (instead of a DataFrame) and convert it to a DataFrame afterwards (which is much faster).
3. Use CDH 5.9.x.
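Here is a hedged sketch of option 1 using TO_CHAR, reusing the placeholder connection details from the question:
val jdbcDF = sqlContext.read.format("jdbc").options(
  Map("url" -> "jdbc:oracle:thin:system/system@db-host:1521:orcl",
    "dbtable" -> "(SELECT TO_CHAR(id) AS id FROM schema_name.table_name)",
    "driver" -> "oracle.jdbc.OracleDriver")).load()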

pyspark phoenix NullPointerException

I am trying to load data into Spark from HBase 1.1 using phoenix-spark 4.9.0 on Spark 2.1, but it fails with the following error:
>>> df = spark.read.format('org.apache.phoenix.spark').option('table', 'namespace.table').option('zkUrl', '10.0.1.1:2181').load()
: java.sql.SQLException: java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2432)
at org.apache.phoenix.query.ConnectionQueryServicesImpl$13.call(ConnectionQueryServicesImpl.java:2352)
at org.apache.phoenix.util.PhoenixContextExecutor.call(PhoenixContextExecutor.java:76)
at org.apache.phoenix.query.ConnectionQueryServicesImpl.init(ConnectionQueryServicesImpl.java:2352)
at org.apache.phoenix.jdbc.PhoenixDriver.getConnectionQueryServices(PhoenixDriver.java:232)
at org.apache.phoenix.jdbc.PhoenixEmbeddedDriver.createConnection(PhoenixEmbeddedDriver.java:147)
at org.apache.phoenix.jdbc.PhoenixDriver.connect(PhoenixDriver.java:202)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getConnection(ConnectionUtil.java:98)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getInputConnection(ConnectionUtil.java:57)
at org.apache.phoenix.mapreduce.util.ConnectionUtil.getInputConnection(ConnectionUtil.java:45)
at org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.getSelectColumnMetadataList(PhoenixConfigurationUtil.java:279)
at org.apache.phoenix.spark.PhoenixRDD.toDataFrame(PhoenixRDD.scala:114)
at org.apache.phoenix.spark.PhoenixRelation.schema(PhoenixRelation.scala:60)
at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:40)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:389)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
Any clues, anyone?
