Error while using mapPartitions in Pyspark - apache-spark

I am new to PySpark. I am running the Spark code below in a Jupyter notebook and getting AttributeError: 'NoneType' object has no attribute '_jvm'.
My Spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
The full error from running the code is below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'

func.sum is for use with DataFrames, not for summing plain Python numbers. Use Python's built-in sum instead:
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator):
    yield sum(iterator)
parallel.mapPartitions(f).collect()
which will give [6, 15, 24].
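For comparison, func.sum aggregates a DataFrame column rather than a Python iterable. A minimal sketch of its intended use (the DataFrame and column name here are illustrative assumptions, not part of the original question):
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.getOrCreate()

# A small DataFrame holding the same numbers 1..9 in a single column
df = spark.createDataFrame([(i,) for i in range(1, 10)], ["value"])

# func.sum operates on a Column inside a DataFrame aggregation
df.agg(func.sum("value").alias("total")).show()  # total = 45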

Related

PySpark: Java Heap Error (Jupyter Notebook)

I am running a simple Spark job that queries a table for 3 columns and 7M rows. I have tried various Spark configs, but every time I get a Java heap space error.
Can someone please help me with this? I am trying to create an ETL process that computes data from 5 tables of similar size, but I get the Java heap error even when running the code with only 1 table. I tried reducing the data volume as well, but I still get the same error.
The tables have more than 60 columns and billions of rows, of which I am pulling only a subset for my process.
Please see the code below:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import *
import getpass
spark =SparkSession.builder.getOrCreate()
spark.sparkContext._conf.getAll()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'),
                                        ('spark.app.name', 'John Doe'),
                                        ('spark.executor.cores', '8'),
                                        ('spark.cores.max', '8'),
                                        ('spark.driver.memory', '15g')])
spark.sparkContext.stop()
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
df=spark.sql("""
SELECT DISTINCT col1
,col2
,col3
from schema.table
where condition1
and condition2
and condition3
and condition4
""")
df.show()
Stacktrace:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480,
in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038,
in send_command
response = connection.send_command(command)
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503,
in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-4-8faeb4b518d0> in <module>
24
25
---> 26 df_upsell.show()
/opt/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
492
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
496 try:
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self,
*args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
/opt/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o683.showString
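Worth noting about the code in the question: the conf is taken from a context that is then stopped, so the memory settings may never reach the new session. A minimal sketch of passing the same settings when the session is first built (values copied from the question, not tuning advice):
from pyspark.sql import SparkSession

# Set resources up front, before any SparkContext exists,
# so the configuration is actually applied.
spark = (SparkSession.builder
         .appName("John Doe")
         .config("spark.driver.memory", "15g")
         .config("spark.executor.memory", "15g")
         .config("spark.executor.cores", "8")
         .config("spark.cores.max", "8")
         .getOrCreate())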

Error while save the data from pyspark to HBase

I am trying to write a Spark DataFrame to HBase using PySpark. I added the Spark HBase dependencies and am running the code in a Jupyter notebook.
I have also created a table in HBase in the default namespace.
I started pyspark by running the command below.
My Spark version is 3.x and my HBase version is hbase-2.2.6.
pyspark --packages com.hortonworks:shc:1.0.0-1.6-s_2.10 --repositories http://repo.hortonworks.com/content/groups/public/ --files /home/vijee/hbase-2.2.6-bin/conf/hbase-site.xml
The dependencies were added successfully.
df = sc.parallelize([('a', 'def'), ('b', 'abc')]).toDF(schema=['col0', 'col1'])
catalog = ''.join("""{
    "table":{"namespace":"default", "name":"smTable"},
    "rowkey":"c1",
    "columns":{
        "col0":{"cf":"rowkey", "col":"c1", "type":"string"},
        "col1":{"cf":"t1", "col":"c2", "type":"string"}
    }
}""".split())
df.write.options(catalog=catalog).format('org.apache.spark.sql.execution.datasources.hbase').save()
When I run the above statement, I get the error below. Since I am new to Spark, I was not able to understand it.
At first I tried with my CSV file and hit the same java.lang.AbstractMethodError; now I am using the sample data and still get the same error.
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-cfcf107b1f03> in <module>
----> 1 df.write.options(catalog=catalog,newtable=5).format('org.apache.spark.sql.execution.datasources.hbase').save()
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
823 self.format(format)
824 if path is None:
--> 825 self._jwrite.save()
826 else:
827 self._jwrite.save(path)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o114.save.
: java.lang.AbstractMethodError: org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;Lorg/apache/spark/sql/SaveMode;Lscala/collection/immutable/Map;Lorg/apache/spark/sql/Dataset;)Lorg/apache/spark/sql/sources/BaseRelation;
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
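As a side note, the catalog in the question is plain JSON assembled by splitting and re-joining a string. A minimal sketch that builds the same catalog (same table and column names as above) with json.dumps, which is less prone to whitespace and quoting slips:
import json

# Same catalog as in the question, built as a dict and serialized once.
catalog = json.dumps({
    "table": {"namespace": "default", "name": "smTable"},
    "rowkey": "c1",
    "columns": {
        "col0": {"cf": "rowkey", "col": "c1", "type": "string"},
        "col1": {"cf": "t1", "col": "c2", "type": "string"},
    },
})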

K-NN in Pyspark

The following code classifies the poker data set, which has 10 features (all numeric) and 10 class labels (all numeric). I used sklearn's K-NN in PySpark with a custom distance function. It throws an error when broadcasting the K-NN model and predicting the test labels. When I do not use a custom function, there is no error. Why is this happening?
x=sc.textFile("/home/ritesh/Spark/poker100.txt")
def parseLine(line):
    cols = line.split(',')  # split the txt file with ','
    # label is the last column
    label = cols[-1]
    # vector is every column, except the label
    vector = cols[:-1]
    vector = [element for i, element in enumerate(vector)]
    # convert each value from string to float
    vector = np.array(vector, dtype=np.float)
    vector = vector.tolist()
    return (label, vector)
x= x.map(parseLine)
train,test=x.randomSplit([0.7,0.3],seed=100)
train=train.map(lambda x: (x[0], x[1]))
test=test.map(lambda x: (x[0],x[1]))
X=train.map(lambda x: x[1])
#collect traing data
X=X.collect()
Y=train.map(lambda x: x[0])
#collect training label
Y=Y.collect()
y=test.map(lambda x: x[0])
# collect testing label
y=y.collect()
import math
def dist(x, y):  # Euclidean distance function between training and testing data
    return np.sqrt(np.sum((x - y) ** 2))
import numpy as np
from sklearn.neighbors.ball_tree import BallTree
BallTree.valid_metrics
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=3,algorithm='ball_tree', metric= dist)
model=knn.fit(X,Y) # fit KNN model
model=sc.broadcast(model)
testdata=test.map(lambda x: model.value.predict(np.array(x[1],dtype="float64").reshape(1,-1))) #predict test data
y_pred=testdata.collect()
On running, it gives this error:
Py4JJavaError
Traceback (most recent call last)
<ipython-input-113-a20ddffd3048> in <module>()
1 model=sc.broadcast(model)
2 testdata=test.map(lambda x: model.value.predict(np.array(x[1],dtype="float64").reshape(1,-1)))
----> 3 y_pred=testdata.collect()
/apps/spark-2.4.3/python/pyspark/rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818
/apps/spark-2.4.3/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/apps/spark-2.4.3/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/apps/spark-2.4.3/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 43.0 failed 1 times, most recent failure: Lost task 1.0 in stage 43.0 (TID 87, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-113-a20ddffd3048>", line 2, in <lambda>
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 148, in value
self._value = self.load_from_path(self._path)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 125, in load_from_path
return self.load(f)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 131, in load
return pickle.load(file)
AttributeError: Can't get attribute 'dist' on <module 'pyspark.daemon' from '/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/daemon.py'>
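The failure is in unpickling the custom dist function on the executors. A minimal sketch, assuming a plain Euclidean metric is acceptable (that is exactly what dist computes), which keeps the broadcast model free of notebook-defined functions:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# The built-in 'euclidean' metric matches dist() above, but the fitted
# model no longer references a function defined in the notebook, so it
# pickles and broadcasts cleanly.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='euclidean')
model = knn.fit(X, Y)
bc_model = sc.broadcast(model)

testdata = test.map(
    lambda x: bc_model.value.predict(
        np.array(x[1], dtype="float64").reshape(1, -1)))
y_pred = testdata.collect()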

Unable to build SparkSession in Python

I'm new to Spark and I'm using it in a Jupyter notebook. I have the following code, which gives me an error:
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
I'm at a loss here; any suggestions as to what the problem could be?
The complete error is too long to post here, but this is part of it:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o23.sessionState.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1053)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:129)
.
.
.
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-2-17a54aa52bc2> in <module>()
1 # Boilerplate Spark stuff
2 #conf = SparkConf().setMaster("local").setAppName("Epidemiology")
----> 3 spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
4 #sc = SparkContext.getOrCreate(conf = conf)
5 #sc = SparkContext(conf = conf)
C:\spark\spark\python\pyspark\sql\session.py in getOrCreate(self)
177 session = SparkSession(sc)
178 for key, value in self._options.items():
--> 179 session._jsparkSession.sessionState().conf().setConfString(key, value)
180 for key, value in self._options.items():
181 session.sparkContext._conf.set(key, value)
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"

pyspark: sort an RDD by the object attribute

I have the following RDD, named my_rdd, which looks like this:
[FreqSequence(sequence=[['John']], freq=18980),
FreqSequence(sequence=[['Mary']], freq=106),
FreqSequence(sequence=[['John-Mary']], freq=381),
FreqSequence(sequence=[['John-Ann']], freq=158),
FreqSequence(sequence=[['Ann']], freq=433)]
I then tried to sort it as below:
new_rdd = my_rdd.sortBy(lambda x: x.freq)
new_rdd.take(5)
but got the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-15-94c1babd943f> in <module>()
1 print(my_rdd.take(5))
2 new_rdd = my_rdd.sortBy(lambda x: x.freq)
----> 3 new_rdd.take(5)
/usr/local/spark-latest/python/pyspark/rdd.py in take(self, num)
1341
1342 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1343 res = self.context.runJob(self, takeUpToNumLeft, p)
1344
1345 items += res
/usr/local/spark-latest/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
963 # SparkContext#runJob.
964 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 965 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
966 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
967
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark-latest/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 65.0 failed 4 times, most recent failure: Lost task 0.3 in stage 65.0 (TID 115, ph-hdp-inv-dn01, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 163, in main
func, profiler, deserializer, serializer = read_command(pickleSer, infile)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 431, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'UserString'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Any idea what was wrong here? Thanks!
Your code is correct. Your error:
ImportError: No module named 'UserString'
is raised because UserString is no longer a standalone module in Python 3.x; it is part of the collections module. This suggests that you are either using an outdated version of PySpark or that one of its dependencies is outdated.
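For reference, the sortBy pattern itself works as written; a minimal self-contained sketch with a stand-in namedtuple (this FreqSequence is just an illustrative substitute for the class in the question) sorts by the freq attribute:
from collections import namedtuple

# Stand-in for the FreqSequence objects shown in the question.
FreqSequence = namedtuple("FreqSequence", ["sequence", "freq"])

my_rdd = sc.parallelize([
    FreqSequence(sequence=[['John']], freq=18980),
    FreqSequence(sequence=[['Mary']], freq=106),
    FreqSequence(sequence=[['Ann']], freq=433),
])

new_rdd = my_rdd.sortBy(lambda x: x.freq)
print(new_rdd.take(3))  # ascending by freq: Mary, Ann, John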
