Error while saving data from PySpark to HBase - apache-spark

I am trying to write a Spark DataFrame to HBase using PySpark. I added the Spark-HBase connector dependencies and I am running the code from a Jupyter notebook.
I have also created a table in HBase in the default namespace.
My Spark version is 3.x and my HBase version is 2.2.6.
I started pyspark by running the command below:
pyspark --packages com.hortonworks:shc:1.0.0-1.6-s_2.10 --repositories http://repo.hortonworks.com/content/groups/public/ --files /home/vijee/hbase-2.2.6-bin/conf/hbase-site.xml
The dependencies were added successfully. I then ran the following:
# Sample DataFrame with two string columns
df = sc.parallelize([('a', 'def'), ('b', 'abc')]).toDF(schema=['col0', 'col1'])

# SHC catalog: col0 maps to the row key, col1 to column family "t1", qualifier "c2"
catalog = ''.join("""{
    "table":{"namespace":"default", "name":"smTable"},
    "rowkey":"c1",
    "columns":{
        "col0":{"cf":"rowkey", "col":"c1", "type":"string"},
        "col1":{"cf":"t1", "col":"c2", "type":"string"}
    }
}""".split())

df.write.options(catalog=catalog).format('org.apache.spark.sql.execution.datasources.hbase').save()
When I run the write statement above, I get the error below. Since I am new to Spark, I was not able to make sense of it.
At first I tried with my CSV file and hit the same "java.lang.AbstractMethodError"; now I am using the sample data and still get the same error.
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-cfcf107b1f03> in <module>
----> 1 df.write.options(catalog=catalog,newtable=5).format('org.apache.spark.sql.execution.datasources.hbase').save()
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
823 self.format(format)
824 if path is None:
--> 825 self._jwrite.save()
826 else:
827 self._jwrite.save(path)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o114.save.
: java.lang.AbstractMethodError: org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;Lorg/apache/spark/sql/SaveMode;Lscala/collection/immutable/Map;Lorg/apache/spark/sql/Dataset;)Lorg/apache/spark/sql/sources/BaseRelation;
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)

Related

pyspark setup issues on anaconda

I am trying to learn PySpark with Jupyter notebooks. I created an env for PySpark and installed it in Anaconda; the Python version is 3.10.8 and the Java version in the env is:
openjdk 17.0.3 2022-04-19 LTS
OpenJDK Runtime Environment Zulu17.34+19-CA (build 17.0.3+7-LTS)
OpenJDK 64-Bit Server VM Zulu17.34+19-CA (build 17.0.3+7-LTS, mixed mode, sharing)
When opening JupyterLab and trying to start my first Spark session, I used:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("JupNote").getOrCreate()
and get the error:
Py4JJavaError Traceback (most recent call last)
c:\Users\frezanlutu\Skills_Training\BigData\pyspark.ipynb Cell 3 in <cell line: 1>()
----> 1 spark = SparkSession.builder.appName("JupNote").getOrCreate()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\sql\session.py:228, in SparkSession.Builder.getOrCreate(self)
226 sparkConf.set(key, value)
227 # This SparkContext may be an existing one.
--> 228 sc = SparkContext.getOrCreate(sparkConf)
229 # Do not update `SparkConf` for existing `SparkContext`, as it's shared
230 # by all sessions.
231 session = SparkSession(sc)
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:392, in SparkContext.getOrCreate(cls, conf)
390 with SparkContext._lock:
391 if SparkContext._active_spark_context is None:
--> 392 SparkContext(conf=conf or SparkConf())
393 return SparkContext._active_spark_context
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:146, in SparkContext.__init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
144 SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
145 try:
--> 146 self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
147 conf, jsc, profiler_cls)
148 except:
149 # If an error occurs, clean up in order to allow future SparkContext creation:
150 self.stop()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:209, in SparkContext._do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls)
206 self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
208 # Create the Java SparkContext through Py4J
--> 209 self._jsc = jsc or self._initialize_context(self._conf._jconf)
210 # Reset the SparkConf to the one actually used by the SparkContext in JVM.
211 self._conf = SparkConf(_jconf=self._jsc.sc().conf())
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\pyspark\context.py:329, in SparkContext._initialize_context(self, jconf)
325 def _initialize_context(self, jconf):
326 """
327 Initialize SparkContext in function to allow subclass specific initialization
328 """
--> 329 return self._jvm.JavaSparkContext(jconf)
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\py4j\java_gateway.py:1585, in JavaClass.__call__(self, *args)
1579 command = proto.CONSTRUCTOR_COMMAND_NAME +\
1580 self._command_header +\
1581 args_command +\
1582 proto.END_COMMAND_PART
1584 answer = self._gateway_client.send_command(command)
-> 1585 return_value = get_return_value(
1586 answer, self._gateway_client, None, self._fqn)
1588 for temp_arg in temp_args:
1589 temp_arg._detach()
File c:\Users\frezanlutu\.conda\envs\pyspark-env\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.storage.StorageUtils$
at org.apache.spark.storage.BlockManagerMasterEndpoint.<init>(BlockManagerMasterEndpoint.scala:110)
at org.apache.spark.SparkEnv$.$anonfun$create$9(SparkEnv.scala:348)
at org.apache.spark.SparkEnv$.registerOrLookupEndpoint$1(SparkEnv.scala:287)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:336)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:191)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:277)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:460)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:833)
I've also tried:
spark = SparkSession.builder.config("spark.driver.host", "localhost").appName("JupNote").getOrCreate()
after looking for solutions, but that produces the same error. Does anyone know if I'm missing something or doing something wrong?
Which Spark version are you using? Spark versions before 3.3.0 do not support Java 17 (only Java 8/11); Java 17 is the version shown in your output. You can check https://spark.apache.org/docs/3.3.0/#downloading
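If you need to stay on a Spark version below 3.3.0, a common workaround is to point the notebook at a Java 8 or 11 JDK before the first session is created. A minimal sketch, assuming a Zulu JDK 11 installed at the example path below (adjust it to your machine):

import os
from pyspark.sql import SparkSession

# Must be set before the first SparkSession/SparkContext is created in this kernel;
# the JDK path is an example only, point it at your own Java 8 or 11 install.
os.environ["JAVA_HOME"] = r"C:\Program Files\Zulu\zulu-11"

spark = SparkSession.builder.appName("JupNote").getOrCreate()
print(spark.version)  # confirm which Spark version the kernel is actually running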

PySpark: Java Heap Error (Jupyter Notebook)

I am running a simple Spark job in which I query a table to get 3 columns and 7M rows. I have tried various Spark configs, but every time I get a Java heap space error.
Can someone please help me with this? I am trying to build an ETL process that computes data from 5 tables, all of similar size, but I get the Java heap error even when running the code against only 1 table. I also tried reducing the data volume, but I still get the same error.
The tables have >60 columns and billions of rows, of which I am fetching only a subset for my process.
Please see the code below:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import *
import getpass
spark =SparkSession.builder.getOrCreate()
spark.sparkContext._conf.getAll()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'),
                                        ('spark.app.name', 'John Doe'),
                                        ('spark.executor.cores', '8'),
                                        ('spark.cores.max', '8'),
                                        ('spark.driver.memory', '15g')])
spark.sparkContext.stop()
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
df=spark.sql("""
SELECT DISTINCT col1
,col2
,col3
from schema.table
where condition1
and condition2
and condition3
and condition4
""")
df.show()
Stacktrace:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480,
in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038,
in send_command
response = connection.send_command(command)
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503,
in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-4-8faeb4b518d0> in <module>
24
25
---> 26 df_upsell.show()
/opt/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
492
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
496 try:
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self,
*args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
/opt/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o683.showString

Error while using mapPartitions in Pyspark

I am new to PySpark. I am running the Spark code below in a Jupyter notebook and getting AttributeError: 'NoneType' object has no attribute '_jvm'.
My spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
The full error from running the code is below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'
func.sum is for use with DataFrame columns, not for summing plain numbers. Use Python's built-in sum function instead:
one_through_9 = range(1, 10)
parallel = sc.parallelize(one_through_9, 3)

def f(iterator):
    yield sum(iterator)

parallel.mapPartitions(f).collect()
which will give [6, 15, 24].
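For contrast, a minimal sketch of where func.sum does belong, namely aggregating a DataFrame column (this assumes an existing SparkSession named spark; the column name n is made up for the example):

from pyspark.sql import functions as func

# func.sum aggregates a DataFrame column; it is not a replacement for Python's built-in sum()
df = spark.createDataFrame([(i,) for i in range(1, 10)], ["n"])
df.select(func.sum("n")).show()  # single row: sum(n) = 45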

Pyspark error - py4j.Py4JException: Method limit([class java.lang.String]) does not exist

When executing code to get a Spark DataFrame from HDFS and then convert it to a pandas DataFrame,
spark_df = spark.read.parquet(*data_paths)
# other code in the process like filtering, groupby etc.
# ....
# write sparkdf to hadoop, get n rows if specified
if n:
    spark_df.limit(n).write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
else:
    spark_df.write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
I get an error:
/home/sarah/anaconda3/envs/py27/lib/python2.7/site-packages/dspipeline/core/wf_spark.pyc in to_pd(spark_df, n, save_csv, csv_sep, csv_quote, quick)
215 # write sparkdf to hadoop, get n rows if specified
216 if n:
--> 217 spark_df.limit(n).write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
218 else:
219 spark_df.write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/sql/dataframe.py in limit(self, num)
472 []
473 """
--> 474 jdf = self._jdf.limit(num)
475 return DataFrame(jdf, self.sql_ctx)
476
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
321 raise Py4JError(
322 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 323 format(target_id, ".", name, value))
324 else:
325 raise Py4JError(
Py4JError: An error occurred while calling o1086.limit. Trace:
py4j.Py4JException: Method limit([class java.lang.String]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
I found the function limit(num) in the PySpark documentation, so I guess the reason is that I'm not using it correctly. Any help?
The exception is pretty clear here:
Method limit([class java.lang.String]) does not exist
The n you are trying to pass to limit is not an int but a str.
You should go back to the point where n is defined and fix it.
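For illustration, a minimal sketch of the fix, assuming n reaches this code as a string (for example from a command-line argument or a config file); the value below is hypothetical:

# Hypothetical: n arrived as the string "10" instead of the integer 10
n = "10"

# Cast to int (here or, better, where n is defined) so limit() receives an integer
spark_df.limit(int(n)).write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)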
DataFrame does have a .limit method: if you want the first n rows of a DataFrame, you can use .limit(n), but the parameter n must be an integer.
For example:
df.limit(10)
If you pass something else, like df.limit('10'), you will get the error:
py4j.Py4JException: Method limit([class java.lang.String]) does not exist.
If you pass a number greater than 2,147,483,647 you will hit a similar error, because limit expects an IntegerType.
IntegerType: Represents 4-byte signed integer numbers. The range of numbers is from -2147483648 to 2147483647.

Unable to build SparkSession in Python

I'm new to Spark, and I'm using it in a Jupyter notebook. I have the following code, which gives me an error:
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
I'm at a loss here; any suggestions as to what the problem could be?
The complete error is too long to post here, but this is part of it:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o23.sessionState.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1053)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:129)
.
.
.
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-2-17a54aa52bc2> in <module>()
1 # Boilerplate Spark stuff
2 #conf = SparkConf().setMaster("local").setAppName("Epidemiology")
----> 3 spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
4 #sc = SparkContext.getOrCreate(conf = conf)
5 #sc = SparkContext(conf = conf)
C:\spark\spark\python\pyspark\sql\session.py in getOrCreate(self)
177 session = SparkSession(sc)
178 for key, value in self._options.items():
--> 179 session._jsparkSession.sessionState().conf().setConfString(key, value)
180 for key, value in self._options.items():
181 session.sparkContext._conf.set(key, value)
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"
