The following code classifies the poker data set, which has 10 numeric features and 10 numeric class labels. I am using scikit-learn's k-NN classifier from within PySpark with a custom distance function. It throws an error when I broadcast the k-NN model and predict the test labels. When I do not use a custom distance function, there is no error. Why is this happening?
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.ball_tree import BallTree

x = sc.textFile("/home/ritesh/Spark/poker100.txt")

def parseLine(line):
    cols = line.split(',')   # split the line on ','
    label = cols[-1]         # the label is the last column
    vector = cols[:-1]       # the vector is every column except the label
    # convert each value from string to float
    vector = np.array(vector, dtype=float).tolist()
    return (label, vector)

x = x.map(parseLine)
train, test = x.randomSplit([0.7, 0.3], seed=100)

# collect the training data and labels
X = train.map(lambda p: p[1]).collect()
Y = train.map(lambda p: p[0]).collect()
# collect the testing labels
y = test.map(lambda p: p[0]).collect()

def dist(x, y):
    # Euclidean distance between a training point and a testing point
    return np.sqrt(np.sum((x - y) ** 2))

BallTree.valid_metrics  # metrics supported by the ball tree

knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric=dist)
model = knn.fit(X, Y)  # fit the k-NN model
model = sc.broadcast(model)
# predict the test data
testdata = test.map(lambda x: model.value.predict(np.array(x[1], dtype="float64").reshape(1, -1)))
y_pred = testdata.collect()
On running, it gives this error:
Py4JJavaError
Traceback (most recent call last)
<ipython-input-113-a20ddffd3048> in <module>()
1 model=sc.broadcast(model)
2 testdata=test.map(lambda x: model.value.predict(np.array(x[1],dtype="float64").reshape(1,-1)))
----> 3 y_pred=testdata.collect()
/apps/spark-2.4.3/python/pyspark/rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818
/apps/spark-2.4.3/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/apps/spark-2.4.3/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/apps/spark-2.4.3/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 43.0 failed 1 times, most recent failure: Lost task 1.0 in stage 43.0 (TID 87, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
process()
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-113-a20ddffd3048>", line 2, in <lambda>
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 148, in value
self._value = self.load_from_path(self._path)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 125, in load_from_path
return self.load(f)
File "/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/broadcast.py", line 131, in load
return pickle.load(file)
AttributeError: Can't get attribute 'dist' on <module 'pyspark.daemon' from '/apps/spark-2.4.3/python/lib/pyspark.zip/pyspark/daemon.py'>
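The question itself notes that the error goes away when no custom distance function is used. Since dist is just Euclidean distance, a minimal sketch of one workaround (an assumption, not a confirmed fix for this setup) is to use the equivalent built-in metric, so that unpickling the broadcast model on the workers never needs to resolve the dist function defined in the notebook:

# Sketch: same model, but with the built-in 'euclidean' metric, which computes
# the same distance as the custom dist function; the broadcast pickle then
# carries no reference to a function defined only in the driver's notebook.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='euclidean')
model = sc.broadcast(knn.fit(X, Y))
y_pred = test.map(lambda x: model.value.predict(
    np.array(x[1], dtype="float64").reshape(1, -1))).collect()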
I am running a simple Spark job in which I query a table to get 3 columns and 7M rows. I have tried various Spark configurations, but every time I get a Java heap space error.
Can someone please help me with this? I am trying to build an ETL process that computes data from 5 tables of similar size, but I get the Java heap error even when running the code with only 1 table. I also tried reducing the data volume, but I still get the same error.
The tables have more than 60 columns and billions of rows, of which I pull only a subset for my process.
Please see the code below:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import *
import getpass
spark = SparkSession.builder.getOrCreate()
spark.sparkContext._conf.getAll()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'),
                                        ('spark.app.name', 'John Doe'),
                                        ('spark.executor.cores', '8'),
                                        ('spark.cores.max', '8'),
                                        ('spark.driver.memory', '15g')])
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()
df=spark.sql("""
SELECT DISTINCT col1
,col2
,col3
from schema.table
where condition1
and condition2
and condition3
and condition4
""")
df.show()
Stacktrace:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480,
in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038,
in send_command
response = connection.send_command(command)
File "/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503,
in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-4-8faeb4b518d0> in <module>
24
25
---> 26 df_upsell.show()
/opt/spark/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
492
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
496 try:
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self,
*args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
/opt/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o683.showString
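For reference, a minimal sketch of passing the same options while the session is first built, rather than on an already-running context (an illustration only; note that spark.driver.memory generally has to be set before the JVM starts, e.g. via spark-submit --driver-memory or spark-defaults.conf, so setting it programmatically from a notebook usually has no effect):

from pyspark.sql import SparkSession

# Sketch: configure executor resources at build time, before any job runs;
# the values are simply the ones from the question.
spark = (SparkSession.builder
         .appName("John Doe")
         .config("spark.executor.memory", "15g")
         .config("spark.executor.cores", "8")
         .config("spark.cores.max", "8")
         .getOrCreate())
# spark.driver.memory is normally supplied at launch time instead, e.g.:
#   spark-submit --driver-memory 15g your_job.py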
I am having trouble executing the following code; I hit the error with the first() command only. If I run df1.rdd.collect(), it runs without any error:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import urllib
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType
from pyspark import SparkContext

spark = SparkSession \
    .builder \
    .getOrCreate()
df = spark.read.csv('subset2009_5000.csv',header=True,inferSchema=True)
df1 = df.select(['Start_Lat','Start_Lon']).dropDuplicates(['Start_Lat','Start_Lon'])
df1.rdd.first() # Fatal Error
I'm getting a fatal error. I am running PySpark in local mode. Driver memory is 3 GB and subset2009_5000.csv is 3.3 MB.
I don't understand why I get this error with .first() only. What is wrong?
#
# A fatal error has been detected by the Java Runtime Environment:
#
# SIGSEGV (0xb) at pc=0x00007fb4a33c37b4, pid=23702, tid=23966
#
# JRE version: OpenJDK Runtime Environment (11.0.13+8) (build 11.0.13+8-Ubuntu-0ubuntu1.20.04)
# Java VM: OpenJDK 64-Bit Server VM (11.0.13+8-Ubuntu-0ubuntu1.20.04, mixed mode, sharing, tiered, compressed oops, g1 gc, linux-amd64)
# Problematic frame:
# V [libjvm.so+0xe4f7b4]
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport %p %s %c %d %P %E" (or dumping to /home/agam/core.23702)
#
# An error report file with more information is saved as:
# /home/agam/hs_err_pid23702.log
Compiled method (nm) 181573 760 n 0 jdk.internal.misc.Unsafe::getInt (native)
total in heap [0x00007fb48c370a90,0x00007fb48c370e40] = 944
relocation [0x00007fb48c370c08,0x00007fb48c370c38] = 48
main code [0x00007fb48c370c40,0x00007fb48c370e40] = 512
Compiled method (nm) 181573 760 n 0 jdk.internal.misc.Unsafe::getInt (native)
total in heap [0x00007fb48c370a90,0x00007fb48c370e40] = 944
relocation [0x00007fb48c370c08,0x00007fb48c370c38] = 48
main code [0x00007fb48c370c40,0x00007fb48c370e40] = 512
Compiled method (nm) 181576 760 n 0 jdk.internal.misc.Unsafe::getInt (native)
total in heap [0x00007fb48c370a90,0x00007fb48c370e40] = 944
relocation [0x00007fb48c370c08,0x00007fb48c370c38] = 48
main code [0x00007fb48c370c40,0x00007fb48c370e40] = 512
#
# If you would like to submit a bug report, please visit:
# https://bugs.launchpad.net/ubuntu/+source/openjdk-lts
#
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50652)
Traceback (most recent call last):
File "/usr/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
self.process_request(request, client_address)
File "/usr/lib/python3.8/socketserver.py", line 347, in process_request
self.finish_request(request, client_address)
File "/usr/lib/python3.8/socketserver.py", line 360, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "/usr/lib/python3.8/socketserver.py", line 747, in __init__
self.handle()
File "/home/agam/.local/lib/python3.8/site-packages/pyspark/accumulators.py", line 260, in handle
poll(authenticate_and_accum_updates)
File "/home/agam/.local/lib/python3.8/site-packages/pyspark/accumulators.py", line 235, in poll
if func():
File "/home/agam/.local/lib/python3.8/site-packages/pyspark/accumulators.py", line 256, in authenticate_and_accum_updates
raise ValueError(
ValueError: The value of the provided token to the AccumulatorServer is not correct.
----------------------------------------
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/agam/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/agam/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/home/agam/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 503, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/tmp/ipykernel_23686/3841479094.py in <module>
1 df1 = df.select(['Start_Lat','Start_Lon']).dropDuplicates(['Start_Lat','Start_Lon'])
----> 2 df1.rdd.first()
~/.local/lib/python3.8/site-packages/pyspark/rdd.py in first(self)
1586 ValueError: RDD is empty
1587 """
-> 1588 rs = self.take(1)
1589 if rs:
1590 return rs[0]
~/.local/lib/python3.8/site-packages/pyspark/rdd.py in take(self, num)
1566
1567 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1568 res = self.context.runJob(self, takeUpToNumLeft, p)
1569
1570 items += res
~/.local/lib/python3.8/site-packages/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
1225 # SparkContext#runJob.
1226 mappedRDD = rdd.mapPartitions(partitionFunc)
-> 1227 sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
1228 return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer))
1229
~/.local/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
~/.local/lib/python3.8/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
~/.local/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
332 format(target_id, ".", name, value))
333 else:
--> 334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob
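Since df1.rdd.collect() reportedly works while df1.rdd.first() crashes, a small diagnostic sketch (an assumption about where to look, not a confirmed fix) is to fetch a single row through the DataFrame API, which avoids the rdd.take()/runJob path that the trace above goes through:

# Sketch: fetch one row via the DataFrame API instead of rdd.first()
print(df1.limit(1).collect())
# and confirm the full scan plus de-duplication runs at all
print(df1.count())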
I am new to PySpark. I am running the Spark code below in a Jupyter notebook and getting AttributeError: 'NoneType' object has no attribute '_jvm'.
My Spark version is 3.0.1.
from pyspark.sql import functions as func
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator): yield func.sum(iterator)
parallel.mapPartitions(f).collect()
The full error from running the code is below.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-55-44576a0dc413> in <module>
2 def valueSum(f): return func.sum(f)
3
----> 4 mapp.mapPartitions(valueSum).collect()
5 #one_through_9 = range(1,10)
6 #parallel = sc.parallelize(one_through_9, 3)
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
887 """
888 with SCCallSiteSync(self.context) as css:
--> 889 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
890 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
891
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
~/spark-3.0.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
126 def deco(*a, **kw):
127 try:
--> 128 return f(*a, **kw)
129 except py4j.protocol.Py4JJavaError as e:
130 converted = convert_exception(e.java_exception)
~/spark-3.0.1-bin-hadoop2.7/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 83, 192.168.43.228, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 595, in process
out_iter = func(split_index, iterator)
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/pyspark/rdd.py", line 425, in func
return f(iterator)
File "<ipython-input-55-44576a0dc413>", line 2, in valueSum
File "/home/vijee/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/functions.py", line 68, in _
jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
AttributeError: 'NoneType' object has no attribute '_jvm'
func.sum is for use with DataFrames, not for summing plain numbers. Use the built-in Python sum function instead:
one_through_9 = range(1,10)
parallel = sc.parallelize(one_through_9, 3)
def f(iterator):
yield sum(iterator)
parallel.mapPartitions(f).collect()
which will give [6, 15, 24].
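For contrast, a minimal sketch of where func.sum does belong: aggregating a DataFrame column (assuming an existing SparkSession named spark):

from pyspark.sql import functions as func

# Sketch: func.sum is a column aggregate, so it is applied to a DataFrame column
df = spark.createDataFrame([(n,) for n in range(1, 10)], ["n"])
df.agg(func.sum("n")).show()  # one row containing 45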
This is a case of using the Keras ImageDataGenerator with .flow_from_directory and wrapping it with tf.data.Dataset.from_generator(...). The dataset fails on any attempt to iterate through it.
Error summary:
InvalidArgumentError: TypeError: endswith first arg must be bytes or a tuple of bytes, not str
Code snippet:
import tensorflow as tf # version 2.1.0
DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz'
flowers_root_path = tf.keras.utils.get_file(origin=DATA_URL, fname='flower_photos', untar=True)
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, rotation_range=20)
gen = img_gen.flow_from_directory(flowers_root_path)
ds = tf.data.Dataset.from_generator(
    # lambda: gen,  # this works
    img_gen.flow_from_directory, args=[flowers_root_path],  # this failed
    output_types=(tf.float32, tf.float32),
    output_shapes=([32, 256, 256, 3], [32, 5])
)
it = iter(ds)
batch = next(it)
print(batch)
Using "lambda: gen" looks ok. Any idea why?
Full Stack trace:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/context.py in execution_mode(mode)
1896 ctx.executor = executor_new
-> 1897 yield
1898 finally:
10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/iterator_ops.py in _next_internal(self)
658 output_types=self._flat_output_types,
--> 659 output_shapes=self._flat_output_shapes)
660
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/gen_dataset_ops.py in iterator_get_next_sync(iterator, output_types, output_shapes, name)
2478 except _core._NotOkStatusException as e:
-> 2479 _ops.raise_from_not_ok_status(e, name)
2480 # Add nodes to the TensorFlow graph.
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/ops.py in raise_from_not_ok_status(e, name)
6605 # pylint: disable=protected-access
-> 6606 six.raise_from(core._status_to_exception(e.code, message), None)
6607 # pylint: enable=protected-access
/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)
InvalidArgumentError: TypeError: endswith first arg must be bytes or a tuple of bytes, not str
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 673, in get_iterator
return self._iterators[iterator_id]
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/script_ops.py", line 236, in __call__
ret = func(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 789, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 675, in get_iterator
iterator = iter(self._generator(*self._args.pop(iterator_id)))
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/image_data_generator.py", line 540, in flow_from_directory
interpolation=interpolation
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/directory_iterator.py", line 126, in __init__
classes, filenames = res.get()
File "/usr/lib/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value
File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/utils.py", line 216, in _list_valid_filenames_in_directory
for root, fname in valid_files:
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/utils.py", line 172, in _iter_valid_files
if fname.lower().endswith('.tiff'):
TypeError: endswith first arg must be bytes or a tuple of bytes, not str
[[{{node PyFunc}}]] [Op:IteratorGetNextSync]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-56-a2623f5ab104> in <module>()
1 it = iter(ds)
----> 2 batch = next(it)
3 print(batch)
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/iterator_ops.py in __next__(self)
628
629 def __next__(self): # For Python 3 compatibility
--> 630 return self.next()
631
632 def _next_internal(self):
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/iterator_ops.py in next(self)
672 """Returns a nested structure of `Tensor`s containing the next element."""
673 try:
--> 674 return self._next_internal()
675 except errors.OutOfRangeError:
676 raise StopIteration
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/iterator_ops.py in _next_internal(self)
663 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
664 except AttributeError:
--> 665 return structure.from_compatible_tensor_list(self._element_spec, ret)
666
667 @property
/usr/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
97 value = type()
98 try:
---> 99 self.gen.throw(type, value, traceback)
100 except StopIteration as exc:
101 # Suppress StopIteration *unless* it's the same exception that
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/context.py in execution_mode(mode)
1898 finally:
1899 ctx.executor = executor_old
-> 1900 executor_new.wait()
1901
1902
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/executor.py in wait(self)
65 def wait(self):
66 """Waits for ops dispatched in this executor to finish."""
---> 67 pywrap_tensorflow.TFE_ExecutorWaitForAllPendingNodes(self._handle)
68
69 def clear_error(self):
InvalidArgumentError: TypeError: endswith first arg must be bytes or a tuple of bytes, not str
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 673, in get_iterator
return self._iterators[iterator_id]
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/script_ops.py", line 236, in __call__
ret = func(*args)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 789, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/data/ops/dataset_ops.py", line 675, in get_iterator
iterator = iter(self._generator(*self._args.pop(iterator_id)))
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/image_data_generator.py", line 540, in flow_from_directory
interpolation=interpolation
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/directory_iterator.py", line 126, in __init__
classes, filenames = res.get()
File "/usr/lib/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value
File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/utils.py", line 216, in _list_valid_filenames_in_directory
for root, fname in valid_files:
File "/usr/local/lib/python3.6/dist-packages/keras_preprocessing/image/utils.py", line 172, in _iter_valid_files
if fname.lower().endswith('.tiff'):
TypeError: endswith first arg must be bytes or a tuple of bytes, not str
[[{{node PyFunc}}]]
As per this Stack Overflow answer, you can make your code work properly by replacing
gen = img_gen.flow_from_directory(flowers_root_path)
with
def Gen():
    gen = img_gen.flow_from_directory(flowers_root_path)
    for (x, y) in gen:
        yield (x, y)
Complete working code is shown below:
import tensorflow as tf # version 2.1.0
DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz'
flowers_root_path = tf.keras.utils.get_file(origin=DATA_URL, fname='flower_photos', untar=True)
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, rotation_range=20)
def Gen():
    gen = img_gen.flow_from_directory(flowers_root_path)
    for (x, y) in gen:
        yield (x, y)

ds = tf.data.Dataset.from_generator(
    Gen, output_types=(tf.float32, tf.float32), output_shapes=([32, 256, 256, 3], [32, 5]))
it = iter(ds)
batch = next(it)
print(batch)
Also, please find the Github Gist with the working code.
I have the following RDD, named my_rdd, which looks like:
[FreqSequence(sequence=[['John']], freq=18980),
FreqSequence(sequence=[['Mary']], freq=106),
FreqSequence(sequence=[['John-Mary']], freq=381),
FreqSequence(sequence=[['John-Ann']], freq=158),
FreqSequence(sequence=[['Ann']], freq=433)]
I then tried to sort it as shown below:
new_rdd = my_rdd.sortBy(lambda x: x.freq)
new_rdd.take(5)
but got the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-15-94c1babd943f> in <module>()
1 print(my_rdd.take(5))
2 new_rdd = my_rdd.sortBy(lambda x: x.freq)
----> 3 new_rdd.take(5)
/usr/local/spark-latest/python/pyspark/rdd.py in take(self, num)
1341
1342 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1343 res = self.context.runJob(self, takeUpToNumLeft, p)
1344
1345 items += res
/usr/local/spark-latest/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
963 # SparkContext#runJob.
964 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 965 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
966 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
967
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/usr/local/spark-latest/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark-latest/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 65.0 failed 4 times, most recent failure: Lost task 0.3 in stage 65.0 (TID 115, ph-hdp-inv-dn01, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 163, in main
func, profiler, deserializer, serializer = read_command(pickleSer, infile)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/worker.py", line 54, in read_command
command = serializer._read_with_length(file)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
return self.loads(obj)
File "/data/0/yarn/nm/usercache/phanalytics-test/appcache/application_1489740042194_0048/container_e20_1489740042194_0048_01_000002/pyspark.zip/pyspark/serializers.py", line 431, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'UserString'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Any idea what was wrong here? Thanks!
Your code is correct. Your error:
ImportError: No module named 'UserString'
is raised because UserString is no longer a standalone module in Python 3.x; it is part of the collections module. This suggests that you are either using an outdated version of PySpark or that one of its dependencies is outdated.
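One way to check for that kind of mismatch is to compare the Python and PySpark versions the driver sees with the Python version the executors actually run; a small diagnostic sketch (assuming a live SparkContext named sc):

import sys
import pyspark

# Sketch: version seen by the driver ...
print("driver:", sys.version.split()[0], "pyspark", pyspark.__version__)
# ... versus the version the executors run the tasks with
print("executors:", sc.parallelize([0], 1).map(lambda _: sys.version.split()[0]).collect())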