I run Spark on a virtual machine and use the MLlib ALS library to train a model on my data.
rawRatings = sc.textFile('data/ratings.csv').map(lambda x: x.replace('\t', ','))
parsedRatings = rawRatings.map(lambda x: x.split(',')).map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
trainData, valData, testData = parsedRatings.randomSplit([0.6, 0.2, 0.2], seed=42)
model = ALS.train(trainData, rank=8, iterations=5, lambda_=0.1)
It works. But if I set iterations=10, it fails with this error message:
Py4JJavaError Traceback (most recent call last)
<ipython-input-181-e64eb91ba0eb> in <module>()
6 regularization_parameter = 0.1
7 tolerance = 0.02
----> 8 model = ALS.train(trainData, rank=8, seed=seed, iterations=7, lambda_=regularization_parameter)
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/recommendation.py in train(cls, ratings, rank, iterations, lambda_, blocks, nonnegative, seed)
138 seed=None):
139 model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
--> 140 lambda_, blocks, nonnegative, seed)
141 return MatrixFactorizationModel(model)
142
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
118 sc = SparkContext._active_spark_context
119 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 120 return callJavaFunc(sc, api, *args)
121
122
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
111 """ Call Java Function """
112 args = [_py2java(sc, a) for a in args]
--> 113 return _java2py(sc, func(*args))
114
115
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/bin/spark-1.3.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o7508.trainALSModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 14882.0 failed 1 times, most recent failure: Lost task 0.0 in stage 14882.0 (TID 3699, localhost): java.lang.StackOverflowError
at java.io.ObjectInputStream$PeekInputStream.peek(ObjectInputStream.java:2293)
at java.io.ObjectInputStream$BlockDataInputStream.peek(ObjectInputStream.java:2586)
at java.io.ObjectInputStream$BlockDataInputStream.peekByte(ObjectInputStream.java:2596)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1505)
.....
I am just wondering what is wrong here. Training works with iterations=6,
but with iterations=7 the same error message appears again. I am running this
in IPython with Python 3.x. Thanks for any generous answers!
Related
I am following the Spark ML example here:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
(1.0, Vectors.dense([0.0, 1.1, 0.1])),
(0.0, Vectors.dense([2.0, 1.0, -1.0])),
(0.0, Vectors.dense([2.0, 1.3, 1.0])),
(1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
However, model1 = lr.fit(training) gives the following error message.
---------------------------------------------------------------------------
IllegalArgumentException Traceback (most recent call last)
<ipython-input-14-3e398ce8c8bd> in <module>
1 # Learn a LogisticRegression model. This uses the parameters stored in lr.
----> 2 model1 = lr.fit(training)
C:\spark\spark-3.0.2-bin-hadoop2.7\python\pyspark\ml\base.py in fit(self, dataset, params)
127 return self.copy(params)._fit(dataset)
128 else:
--> 129 return self._fit(dataset)
130 else:
131 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
C:\spark\spark-3.0.2-bin-hadoop2.7\python\pyspark\ml\wrapper.py in _fit(self, dataset)
319
320 def _fit(self, dataset):
--> 321 java_model = self._fit_java(dataset)
322 model = self._create_model(java_model)
323 return self._copyValues(model)
C:\spark\spark-3.0.2-bin-hadoop2.7\python\pyspark\ml\wrapper.py in _fit_java(self, dataset)
316 """
317 self._transfer_params_to_java()
--> 318 return self._java_obj.fit(dataset._jdf)
319
320 def _fit(self, dataset):
C:\spark\spark-3.0.2-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
C:\spark\spark-3.0.2-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
C:\spark\spark-3.0.2-bin-hadoop2.7\python\pyspark\sql\utils.py in raise_from(e)
IllegalArgumentException: requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.
I am trying to write a Spark DataFrame to HBase using PySpark. I have added the Spark HBase dependencies, and I am running the code in a Jupyter notebook.
Also, I have created a table in HBase in the default namespace.
I started pyspark by running the below command.
My spark version: spark 3.x
and HBase version: hbase-2.2.6
pyspark --packages com.hortonworks:shc:1.0.0-1.6-s_2.10 --repositories http://repo.hortonworks.com/content/groups/public/ --files /home/vijee/hbase-2.2.6-bin/conf/hbase-site.xml
The dependencies are successfully added
df = sc.parallelize([('a', 'def'), ('b', 'abc')]).toDF(schema=['col0', 'col1'])
catalog = ''.join("""{
"table":{"namespace":"default", "name":"smTable"},
"rowkey":"c1",
"columns":{
"col0":{"cf":"rowkey", "col":"c1", "type":"string"},
"col1":{"cf":"t1", "col":"c2", "type":"string"}
}
}""".split())
df.write.options(catalog=catalog).format('org.apache.spark.sql.execution.datasources.hbase').save()
When I run the above statement, I get the error below. Since I am new to Spark, I was not able to understand the error.
At first, I tried with my CSV file and got the same "java.lang.AbstractMethodError". Now I am using the sample data and still get the same error.
Py4JJavaError Traceback (most recent call last)
<ipython-input-9-cfcf107b1f03> in <module>
----> 1 df.write.options(catalog=catalog,newtable=5).format('org.apache.spark.sql.execution.datasources.hbase').save()
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
823 self.format(format)
824 if path is None:
--> 825 self._jwrite.save()
826 else:
827 self._jwrite.save(path)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o114.save.
: java.lang.AbstractMethodError: org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(Lorg/apache/spark/sql/SQLContext;Lorg/apache/spark/sql/SaveMode;Lscala/collection/immutable/Map;Lorg/apache/spark/sql/Dataset;)Lorg/apache/spark/sql/sources/BaseRelation;
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
I am new to PySpark. I am running the Spark code below in a Jupyter notebook and getting AttributeError: 'NoneType' object has no attribute '_jvm'
My spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
# NOTE(review): `func` is pyspark.sql.functions (imported above); the traceback
# below fails inside pyspark/sql/functions.py at `sc._jvm`, i.e. on this
# func.sum call made from within the partition function.
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
Find below the full error while running the code.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'
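func.sum (pyspark.sql.functions.sum) is for use with dataframes, not for summing numbers: it builds a column expression through the driver's SparkContext, which does not exist inside a function running on the workers (hence the 'NoneType' object has no attribute '_jvm' error). Use the Python sum function instead: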
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator):
    """Yield the sum of the elements in one partition.

    Uses the built-in ``sum`` so it operates on the plain Python values
    handed to the function as ``iterator``; an empty partition yields 0.
    """
    yield sum(iterator)
parallel.mapPartitions(f).collect()
which will give [6, 15, 24].
I'm new to Spark, and I'm using it in a jupyter notebook. I have the following code, which gives me an error:
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
I'm at a loss here, any suggestions as to what could be the problem?
The complete error is too long to post here, but this is part of it:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o23.sessionState.
: java.lang.IllegalArgumentException: Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1053)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at org.apache.spark.sql.SparkSession$$anonfun$sessionState$2.apply(SparkSession.scala:130)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:129)
.
.
.
During handling of the above exception, another exception occurred:
IllegalArgumentException Traceback (most recent call last)
<ipython-input-2-17a54aa52bc2> in <module>()
1 # Boilerplate Spark stuff
2 #conf = SparkConf().setMaster("local").setAppName("Epidemiology")
----> 3 spark = SparkSession.builder.master("local").appName("Epidemiology").config(conf = SparkConf()).getOrCreate()
4 #sc = SparkContext.getOrCreate(conf = conf)
5 #sc = SparkContext(conf = conf)
C:\spark\spark\python\pyspark\sql\session.py in getOrCreate(self)
177 session = SparkSession(sc)
178 for key, value in self._options.items():
--> 179 session._jsparkSession.sessionState().conf().setConfString(key, value)
180 for key, value in self._options.items():
181 session.sparkContext._conf.set(key, value)
C:\spark\spark\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionStateBuilder':"
I'm trying to use pyspark.mllib.stat.KernelDensity this way:
data = sc.parallelize([0, 1, 2, 2, 1, 1, 1, 1, 1, 2, 0, 0])
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(3)
densities = kd.estimate([-1.0, 2.0, 5.0])
but eventually get this error:
--------------------------------------------------------------------------- Py4JError Traceback (most recent call
last) in ()
8
9 # Find density estimates for the given values
---> 10 densities = kd.estimate([-1.0, 2.0, 5.0])
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/stat/KernelDensity.py
in estimate(self, points)
56 points = list(points)
57 densities = callMLlibFunc(
---> 58 "estimateKernelDensity", self._sample, self._bandwidth, points)
59 return np.asarray(densities)
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py
in callMLlibFunc(name, *args)
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
130 print(api)
--> 131 return callJavaFunc(sc, api, *args)
132
133
/home/user10215193/anaconda3/lib/python3.6/site-packages/pyspark/mllib/common.py
in callJavaFunc(sc, func, *args)
121 """ Call Java Function """
122 args = [_py2java(sc, a) for a in args]
--> 123 return _java2py(sc, func(*args))
124
125
/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py
in call(self, *args) 1131 answer =
self.gateway_client.send_command(command) 1132 return_value
= get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name) 1134 1135 for temp_arg in temp_args:
/home/user10215193/anaconda3/lib/python3.6/site-packages/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
321 raise Py4JError(
322 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 323 format(target_id, ".", name, value))
324 else:
325 raise Py4JError(
Py4JError: An error occurred while calling o19.estimateKernelDensity.
Trace: py4j.Py4JException: Method estimateKernelDensity([class
org.apache.spark.api.java.JavaRDD, class java.lang.Integer, class
java.util.ArrayList]) does not exist at
py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at
py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:214) at
java.lang.Thread.run(Thread.java:748)
I couldn't find anything similar here so if somebody can help me with this I would much appreciate it.
You have to be careful about the types:
bandwidth has to be float
sample has to be RDD[float]
So replace your code with:
kd.setSample(data.map(float))
kd.setBandwidth(3.0)
densities = kd.estimate([-1.0, 2.0, 5.0])
and you'll be fine.