mllib KernelDensity error - python-3.x

I'm trying to use pyspark.mllib.stat.KernelDensity this way:
data = sc.parallelize([0, 1, 2, 2, 1, 1, 1, 1, 1, 2, 0, 0])
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3)
densities = kd.estimate([-1.0, 2.0, 5.0])
but eventually get this error:
--------------------------------------------------------------------------- Py4JError Traceback (most recent call
last) in ()
8
9 # Find density estimates for the given values
---> 10 densities = kd.estimate([-1.0, 2.0, 5.0])
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/stat/KernelDensity.py
in estimate(self, points)
56 points = list(points)
57 densities = callMLlibFunc(
---> 58 "estimateKernelDensity", self._sample, self._bandwidth, points)
59 return np.asarray(densities)
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py
in callMLlibFunc(name, *args)
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
130 print(api)
--> 131 return callJavaFunc(sc, api, *args)
132
133
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py
in callJavaFunc(sc, func, *args)
121 """ Call Java Function """
122 args = [_py2java(sc, a) for a in args]
--> 123 return _java2py(sc, func(*args))
124
125
/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py
in call(self, *args) 1131 answer =
self.gateway_client.send_command(command) 1132 return_value
= get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name) 1134 1135 for temp_arg in temp_args:
/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
321 raise Py4JError(
322 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 323 format(target_id, ".", name, value))
324 else:
325 raise Py4JError(
Py4JError: An error occurred while calling o19.estimateKernelDensity.
Trace: py4j.Py4JException: Method estimateKernelDensity([class
org.apache.spark.api.java.JavaRDD, class java.lang.Integer, class
java.util.ArrayList]) does not exist at
py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at
py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:214) at
java.lang.Thread.run(Thread.java:748)
I couldn't find anything similar here so if somebody can help me with this I would much appreciate it.

You have to be careful about the types:
bandwidth has to be float
sample has to be RDD[float]
So replace your code with:
kd.setSample(data.map(float))
kd.setBandwidth(3.0)
densities = kd.estimate([-1.0, 2.0, 5.0])
and you'll be fine.

Related

Error while using mapPartitions in Pyspark

I am new to Python spark and I am running the below spark code in the Jupyter notebook and getting AttributeError: 'NoneType' object has no attribute '_jvm'
My spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
Find below the full error while running the code.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'
func.sum is for use with dataframes, not for summing numbers. Use the Python sum function instead:
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator):
yield sum(iterator)
parallel.mapPartitions(f).collect()
which will give [6, 15, 24].

Pyspark error - py4j.Py4JException: Method limit([class java.lang.String]) does not exist

When executing a code to get a spark dataframe from HDFS and then convert it to pandas dataframe,
spark_df = spark.read.parquet(*data_paths)
# other code in the process like filtering, groupby etc.
# ....
# write sparkdf to hadoop, get n rows if specified
if n:
spark_df.limit(n).write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
else:
spark_df.write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
I get an error:
/home/sarah/anaconda3/envs/py27/lib/python2.7/site-packages/dspipeline/core/wf_spark.pyc in to_pd(spark_df, n, save_csv, csv_sep, csv_quote, quick)
215 # write sparkdf to hadoop, get n rows if specified
216 if n:
--> 217 spark_df.limit(n).write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
218 else:
219 spark_df.write.csv(tmpfoldername, sep=csv_sep, quote=csv_quote)
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/sql/dataframe.py in limit(self, num)
472 []
473 """
--> 474 jdf = self._jdf.limit(num)
475 return DataFrame(jdf, self.sql_ctx)
476
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/opt/spark-2.3.0-SNAPSHOT-bin-spark-master/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
321 raise Py4JError(
322 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 323 format(target_id, ".", name, value))
324 else:
325 raise Py4JError(
Py4JError: An error occurred while calling o1086.limit. Trace:
py4j.Py4JException: Method limit([class java.lang.String]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
As I found the function limit(num) in pyspark documentation, I guess the reason is that I'm not correctly using it. Any help?
The exception is pretty clear here:
Method limit([class java.lang.String]) does not exist
n you are trying to pass to limit is not an int but a str.
You should go back to the point where n is defined, and fix it.
There exits .limit method for DataFrame, if you want to get the n rows from a DataFrame, you can use .limit(n) method, but parameter n is must be integer.
example:
df.limit(10)
if you use the other param like df.limit('10'), an error will occurre:
py4j.Py4JException: Method limit([class java.lang.String]) does not exist.
If you pass a number greater than 2,147,483,647 you will get the same error because limit expects an IntegerType.
IntegerType: Represents 4-byte signed integer numbers. The range of numbers is from -2147483648 to 2147483647.

Error while running PageRank and BFS functions on Graphframes in PySpark

I'm new to Spark, and am learning it on the Cloudera Distr for Hadoop (CDH). I'm trying to execute the PageRank and BFS functions through Jupyter Notebook, which was initiated using the following command:
pyspark --packages graphframes:graphframes:0.1.0-spark1.6,com.databricks:spark-csv_2.11:1.2.0
The below is the PageRank function command I tried to run, along with the error message:
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-20-34d549cc033e> in <module>()
----> 1 ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
2 ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(20).show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in pageRank(self, resetProbability, sourceId, maxIter, tol)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o106.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.apache.spark.graphx.lib.backport.PageRank$.log(PageRank.scala:65)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.apache.spark.graphx.lib.backport.PageRank$.logInfo(PageRank.scala:65)
at org.apache.spark.graphx.lib.backport.PageRank$.runWithOptions(PageRank.scala:148)
at org.graphframes.lib.PageRank$.run(PageRank.scala:130)
at org.graphframes.lib.PageRank.run(PageRank.scala:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
I'm getting the same error messages for the BFS function I'm trying:
filteredPaths = tripGraph.bfs(
fromExpr = "id = 'SEA'",
toExpr = "id = 'SFO'",
maxPathLength = 1)
Output:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-22-74394b11f50d> in <module>()
4 fromExpr = "id = 'SEA'",
5 toExpr = "id = 'SFO'",
----> 6 maxPathLength = 1)
7
8 filteredPaths.show()
/tmp/spark-3bdc323d-a439-4f0a-ac1d-4e64ef4d1396/userFiles-0c248c5c-29fc-44c7-bfd9-3543500350dc/graphframes_graphframes-0.1.0-spark1.6.jar/graphframes/graphframe.pyc in bfs(self, fromExpr, toExpr, edgeFilter, maxPathLength)
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/usr/lib/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/usr/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o147.run.
: java.lang.AbstractMethodError
at org.apache.spark.Logging$class.log(Logging.scala:50)
at org.graphframes.lib.BFS$.log(BFS.scala:131)
at org.apache.spark.Logging$class.logInfo(Logging.scala:58)
at org.graphframes.lib.BFS$.logInfo(BFS.scala:131)
at org.graphframes.lib.BFS$.org$graphframes$lib$BFS$$run(BFS.scala:212)
at org.graphframes.lib.BFS.run(BFS.scala:126)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Can you please let me know the issue?
Thanks, Sasi.
You are using incompatible Scala versions:
graphframes:graphframes:0.1.0-spark1.6 - Scala 2.10
com.databricks:spark-csv_2.11:1.2.0 - Scala 2.11
Spark installation - Probably Scala 2.10.
You have to use the same Scala version for all components (com.databricks:spark-csv_2.10:1.2.0 if Spark is compiled with Scala 2.10). Please consult Resolving dependency problems in Apache Spark for details.

Unable to build SparkSession in Python

I'm new to Spark, and I'm using it in a jupyter notebook. I have the following code, which gives me an error:
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
I'm at a loss here, any suggestions as to what could be the problem?
The complete error is too long to post here, but this is part of it:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o23.sessionState.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1053)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:129)
.
.
.
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-2-17a54aa52bc2> in <module>()
1 # Boilerplate Spark stuff
2 #conf = SparkConf().setMaster("local").setAppName("Epidemiology")
----> 3 spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
4 #sc = SparkContext.getOrCreate(conf = conf)
5 #sc = SparkContext(conf = conf)
C:\spark\spark\python\pyspark\sql\session.py in getOrCreate(self)
177 session = SparkSession(sc)
178 for key, value in self._options.items():
--> 179 session._jsparkSession.sessionState().conf().setConfString(key, value)
180 for key, value in self._options.items():
181 session.sparkContext._conf.set(key, value)
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"

has training error using pyspark ALS

I run Spark on a virtual machine and implemented ALS library to train my data.
rawRatings = sc.textFile('data/ratings.csv').map(lambda x: x.replace('\t', ','))
parsedRatings = rawRatings.map(lambda x: x.split(',')).map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
trainData, valData, testData = parsedRatings.randomSplit([0.6, 0.2, 0.2], seed=42)
model = ALS.train(trainData, rank=8, iterations=5, lambda_=0.1)
It works. But if I tuned iteration=10, then it shows the error message:
Py4JJavaError Traceback (most recent call last)
<ipython-input-181-e64eb91ba0eb> in <module>()
6 regularization_parameter = 0.1
7 tolerance = 0.02
----> 8 model = ALS.train(trainData, rank=8, seed=seed, iterations=7, lambda_=regularization_parameter)
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/recommendation.py in train(cls, ratings, rank, iterations, lambda_, blocks, nonnegative, seed)
138 seed=None):
139 model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
--> 140 lambda_, blocks, nonnegative, seed)
141 return MatrixFactorizationModel(model)
142
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
118 sc = SparkContext._active_spark_context
119 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 120 return callJavaFunc(sc, api, *args)
121
122
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
111 """ Call Java Function """
112 args = [_py2java(sc, a) for a in args]
--> 113 return _java2py(sc, func(*args))
114
115
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o7508.trainALSModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 14882.0 failed 1 times, most recent failure: Lost task 0.0 in stage 14882.0 (TID 3699, localhost): java.lang.StackOverflowError
at java.io.ObjectInputStream$PeekInputStream.peek(ObjectInputStream.java:2293)
at java.io.ObjectInputStream$BlockDataInputStream.peek(ObjectInputStream.java:2586)
at java.io.ObjectInputStream$BlockDataInputStream.peekByte(ObjectInputStream.java:2596)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1505)
.....
I am just wondering what's wrong with that? It is ok to tune iterations =6,
but iterations = 7 will start to have such error message again. I used it
in iPython and Python 3.x version. Thanks for any generous answers!

Resources