Facing an issue integrating AWS Glue, Ray, and PySpark - python-3.x

I am facing the following exception. I have tried various ways to resolve it, but with no luck. The exception is raised during parallel distributed processing with the Ray library:
Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
Traceback (most recent call last):
File "etl_engine_ray.py", line 148, in <module>
print(perform_etl(**requested_data))
File "etl_engine_ray.py", line 138, in perform_etl
futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
File "etl_engine_ray.py", line 138, in <listcomp>
futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/remote_function.py", line 124, in _remote_proxy
return self._remote(args=args, kwargs=kwargs)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 295, in _invocation_remote_span
return method(self, args, kwargs, *_args, **_kwargs)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/remote_function.py", line 263, in _remote
self._pickled_function = pickle.dumps(self._function)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 620, in dump
return Pickler.dump(self, obj)
File "/home/glue_user/spark/python/pyspark/context.py", line 362, in __getnewargs__
"It appears that you are attempting to reference SparkContext from a broadcast "
Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
from pyspark import SparkContext
from awsglue.context import GlueContext
from awsglue.transforms import SelectFields
import os
import time
import ray
import settings

sc = SparkContext.getOrCreate()
glue_context = GlueContext(sc)

@ray.remote
def process_etl(path: str, uid: str, integration_id: str, time_stamp: int):
    try:
        dynamic_df = glue_context.create_dynamic_frame_from_options(
            connection_type=settings.CONNECTION_TYPE,
            connection_options={
                'paths': [path],
                'recurse': True,
                'groupFiles': settings.S3_GROUP_FILES,
                'groupSize': settings.S3_GROUP_SIZE},
            format='json',
            format_options={"jsonPath": "*"}
        )
        # Select only the columns that are required
        selected_data = SelectFields.apply(
            frame=dynamic_df,
            paths=['partner_script_id', 'details', 'registered_installation_id', 'type']
        )
        # Build the output Parquet path
        file_name = os.path.basename(path).split('.')[0]
        parquet_path = f'{settings.S3_BUCKET_PATH}/{integration_id}/{uid}/{time_stamp}/{file_name}.parquet'
        # If a custom pipeline is registered for this file, apply it
        if file_name in settings.CUSTOM_ETL_PIPELINE:
            selected_data = settings.CUSTOM_ETL_PIPELINE.get(file_name)(selected_data)
        # Write the data into the bucket in Parquet format
        glue_context.write_dynamic_frame_from_options(
            selected_data,
            connection_type=settings.CONNECTION_TYPE,
            connection_options={'path': parquet_path},
            format='parquet',
            format_options={
                "compression": "snappy",
                'blockSize': settings.BLOCK_SIZE,
                'pageSize': settings.PAGE_SIZE}
        )
    except Exception as error:
        print(f'Exception in process_etl is {error}')
    return parquet_path

def perform_etl(uid: str, integration_id: str, type: str, data: list) -> dict:
    time_stamp = int(time.time())
    futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
    # a = sc.parallelize(data)
    # d = a.map(lambda each: process_etl.remote(each, uid, integration_id, time_stamp)).collect()
    # print(d)
    final_data = ray.get(futures)
    print(time.time() - start_time)
    return final_data

if __name__ == '__main__':
    print(perform_etl(**requested_data))
I have done a lot of research and development but still have not found the root cause. It remains unresolved; please help me out with this.
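For context, the SPARK-5063 guard in the traceback fires whenever a SparkContext ends up inside a function that has to be pickled for workers; here Ray's cloudpickle serializes process_etl, whose globals include glue_context and therefore sc. A minimal, hypothetical illustration of the rule in plain PySpark (not the Glue code above):

```
# The SparkContext may only be used on the driver, never inside code shipped to workers.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rdd = sc.parallelize([1, 2, 3])

# This would raise the same SPARK-5063 exception, because the lambda closes over `sc`:
# rdd.map(lambda x: sc.parallelize([x]).count()).collect()

# Keeping all SparkContext usage on the driver works:
print(rdd.map(lambda x: x * 2).collect())
```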

Related

Using Faker with PySpark Dataframe to Anonymise Data

I am trying to change a few columns in my Spark DataFrame. I have columns like:
First Name
Last Name
Email
I want to anonymise these and generate meaningful values, for which I am using Faker.
But if I use
df.withColumn('FirstName', lit(fake.first_name()))
it adds the same name for all rows, i.e. the same value for every first name. Ideally I would like a different Faker value for each row rather than a constant. How would I achieve this?
Update 1:
I looked at Steven's suggestion; here is my updated code:
import logging

import pyspark.sql.functions as sf
from faker import Faker
from pyspark.sql import functions as F

MSG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(format=MSG_FORMAT, datefmt=DATETIME_FORMAT)
logger = logging.getLogger("[SFDC-GLUE-LOG]")
fake = Faker()

source_df = spark.read.format("jdbc").option("url", connection_url).option("query", query).option("driver", driver_name).option("user", user_name).option("password", password).option("StmtCallLimit", 0).load()
fake_firstname = F.udf(fake.first_name)
masked_df = source_df.withColumn("FirstName", fake_firstname())
Now I get:
Traceback (most recent call last):
File "script_2020-08-05-17-15-26.py", line 52, in <module>
masked_df=source_df.withColumn("FirstName", fake_firstname())
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 189, in wrapper
return self(*args)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 167, in __call__
judf = self._judf
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 151, in _judf
self._judf_placeholder = self._create_judf()
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 160, in _create_judf
wrapped_func = _wrap_function(sc, self.func, self.returnType)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 35, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/rdd.py", line 2420, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/serializers.py", line 600, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: TypeError: can't pickle weakref objects
You need to use a UDF for that:
from pyspark.sql import functions as F
fake_firstname = F.udf(fake.first_name)
df.withColumn("FirstName", fake_firstname())
I had the same problem; here is the solution that worked for me:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from faker import Factory

def fake_name():
    faker = Factory.create()
    return faker.name()

fake_name_udf = udf(fake_name, StringType())
df = df.withColumn('name', fake_name_udf())
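A likely reason the Factory-based answer works where F.udf(fake.first_name) failed in the update above: creating the Faker inside the UDF body means only a plain Python function is pickled, so the Faker object's unpicklable internals never have to be serialized. A minimal, self-contained sketch along those lines (the local SparkSession and sample DataFrame are placeholders, not from the original post):

```
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[*]").getOrCreate()

@udf(returnType=StringType())
def fake_first_name():
    # A new Faker per call is simple but slow; caching one per executor is also possible.
    return Faker().first_name()

df = spark.createDataFrame([(1,), (2,), (3,)], ["id"])
df.withColumn("FirstName", fake_first_name()).show()
```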

pandas_udf in PySpark 3.0.0 raises an unexpected error

I followed the example on the Spark website, but it goes wrong. My code is below:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.pandas.functions import pandas_udf

class SparkBase(object):
    def __init__(self, master="local[*]", app_name="SparkBase"):
        _conf = SparkConf().setMaster(master).setAppName(app_name)
        _conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
        _conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", True)
        self.sc = SparkContext().getOrCreate(conf=_conf)
        self.spark = SparkSession.builder.config(conf=_conf).enableHiveSupport().getOrCreate()

@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3["col2"] = s1 + s2.str.len()
    return s3

if __name__ == "__main__":
    spark_base = SparkBase()
    df = spark_base.spark.createDataFrame([[1, "a string", ("a nested string",)]],
                                          "long_c long, str_c string, struct_c struct<col1: string>")
    df.show()
The error:
Traceback (most recent call last):
File "F:/otherproj/localpyspark/pyspark3/sparkbase.py", line 24, in <module>
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\pandas\functions.py", line 426, in _create_pandas_udf
return _create_udf(f, returnType, evalType)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 43, in _create_udf
return udf_obj._wrapped()
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 204, in _wrapped
wrapper.returnType = self.returnType
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 94, in returnType
self._returnType_placeholder = _parse_datatype_string(self._returnType)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 822, in _parse_datatype_string
raise e
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 812, in _parse_datatype_string
return from_ddl_schema(s)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 804, in from_ddl_schema
sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json())
AttributeError: 'NoneType' object has no attribute '_jvm'
If I comment out the func function, it runs successfully. What is wrong with it? Is it a bug in Spark 3.0.0?
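One plausible cause, judging from the traceback: the DDL string in @pandas_udf("col1 string, col2 long") is parsed when the decorator runs, which needs an active SparkContext, and in the code above the decorator executes at import time before SparkBase() has created one, so sc is None inside _parse_datatype_string. A minimal sketch under that assumption, creating the session before defining the UDF (closely following the Spark 3.0 documentation example):

```
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.pandas.functions import pandas_udf

# Create the SparkSession (and hence an active SparkContext) first ...
spark = (SparkSession.builder.master("local[*]")
         .config("spark.sql.execution.arrow.pyspark.enabled", "true")
         .getOrCreate())

# ... and only then define the pandas_udf, so its DDL return type can be parsed.
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3["col2"] = s1 + s2.str.len()
    return s3

df = spark.createDataFrame(
    [[1, "a string", ("a nested string",)]],
    "long_c long, str_c string, struct_c struct<col1: string>")
df.select(func("long_c", "str_c", "struct_c")).show()
```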

Azure databricks: KafkaUtils createDirectStream causes Py4JNetworkError("Answer from Java side is empty") error

In Azure Databricks, I tried to create a Kafka stream in a notebook and use it in a Spark job. Databricks throws an error at the line KafkaUtils.createDirectStream(). The corresponding code is attached below.
from kazoo.client import KazooClient
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

sc = spark.sparkContext
ssc = StreamingContext(sc, 30)
print('SSC created:: {}'.format(ssc))

zk = KazooClient(hosts=kafka_host)
print(kafka_host)
zk.start()

_offset_directory = "/" + topic + "/" + "DA_DAINT" + "/partitions"
print(_offset_directory)

if zk.exists(_offset_directory):
    partitions = zk.get_children(_offset_directory)
    print(partitions)

    partition_offsets_dict = {}
    for partition in partitions:
        offset, stat = zk.get((_offset_directory + '/' + partition))
        partition_offsets_dict[partition] = offset.decode()
    print(partition_offsets_dict)

    from_offset = {}
    for _partition in partitions:
        offset = partition_offsets_dict[_partition]
        topic_partition = TopicAndPartition(topic, int(_partition))
        from_offset[topic_partition] = int(offset)
    print(from_offset)

    print("\nCreate kafka direct stream ...")
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": broker_list},
                                                 fromOffsets=from_offset)
Attaching the error stack traces.
Traceback (most recent call last):
File "/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
response = connection.send_command(command)
File "/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
An error occurred while calling
o581.createTopicAndPartition Traceback (most recent call last):
File "<command-3832551107104577>", line 77, in <module> fromOffsets=from_offset)
File "/databricks/spark/python/pyspark/streaming/kafka.py", line 141, in createDirectStream v) for (k, v) in fromOffsets.items()])
File "/databricks/spark/python/pyspark/streaming/kafka.py", line 141, in <listcomp> v) for (k, v) in fromOffsets.items()])
File "/databricks/spark/python/pyspark/streaming/kafka.py",
line 314, in _jTopicAndPartition return helper.createTopicAndPartition(self._topic, self._partition)
File "/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py",
line 1257, in __call__ answer, self.gateway_client, self.target_id, self.name)
File "/databricks/spark/python/pyspark/sql/utils.py",
line 63, in deco return f(*a, **kw)
File "/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 336, in get_return_value format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o581.createTopicAndPartition
To use the Kafka stream in the Azure Databricks Python notebook, I have installed the kafka-python and org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.1 libraries and added them as dependencies to the Spark job in Databricks.
Note 1:
I am able to receive data from Kafka when I use a simple Kafka consumer in the Databricks notebook:
from kafka import KafkaConsumer

if __name__ == "__main__":
    consumer_ = KafkaConsumer(group_id='test', bootstrap_servers=['my_kafka_server:9092'])
    print(consumer_.topics())
    consumer_.subscribe(topics=['dev_test'])
    for m in consumer_:
        print(m)
The problem arises only if I try to create a Kafka direct stream using KafkaUtils.createDirectStream() in the Azure Databricks Python notebook.
Here is another minimal set of code for reproducing this issue:
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
broker = "broker:9092"
topic = "dev_topic"
sc = spark.sparkContext
ssc = StreamingContext(sc, 30)
dks = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": broker})
print("Direct stream created...")
parsed = dks.map(lambda v: v[1])
summary_dstream = parsed.count().map(lambda x: 'Words in this batch: %s' % x)
print(summary_dstream)
NOTE 2:
Kafka version: 0.10
Scala version: 2.11
Spark version: 2.4.3
I am still unable to pin down the root cause of the issue, but using the jar org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.4.3 fixed it, presumably because that assembly matches the cluster's Spark 2.4.3 while the originally installed package was built against Spark 2.0.1.
UPDATE 1:
I got the following update from the Microsoft support team:
Below is the update from Databricks engineering.
We see the customer is using the DStreams API
(https://learn.microsoft.com/en-us/azure/databricks/spark/latest/rdd-streaming/),
which is outdated and we don't support it anymore. Also, we strongly recommend they switch
to Structured Streaming; you can follow this doc for doing it:
https://learn.microsoft.com/en-us/azure/databricks/spark/latest/structured-streaming/kafka
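For reference, a minimal sketch of the Structured Streaming approach recommended above (the broker and topic names are placeholders rather than values from the original post, and spark is the session already available in a Databricks notebook):

```
# Read the topic with Structured Streaming instead of the DStreams API.
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker:9092")   # placeholder broker
      .option("subscribe", "dev_topic")                   # placeholder topic
      .option("startingOffsets", "latest")
      .load())

# Kafka records arrive as binary key/value columns; cast the value to a string.
parsed = df.selectExpr("CAST(value AS STRING) AS value")

query = (parsed.writeStream
         .format("console")
         .outputMode("append")
         .start())
```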

Unable to import the CustomOperator defined in a python script in plugins folder

I am trying to write a custom operator and sensor in apache-airflow.
The DAG basically has 3 operators and 1 sensor. The first operator/task calls a Python method and prints a message to the console. Then the 2nd operator runs; it is a custom operator placed inside the plugins folder in a file named "custom_operator.py", and it inserts data into a MongoDB database. Next, a custom sensor is called, which uses mongo_hook to monitor the DB and check for the value; it lives in the same custom_operator.py file inside plugins. After this, a simple Python operator is called.
I have already tried:
Can't import Airflow plugins
```
# home/autotest/airflow/dags/custom_dag1.py
import logging
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from airflow.operators import InsertDb
from airflow.operators import DbSensor

log = logging.getLogger(__name__)

defaultArgs = {
    'owner': 'mohit_saumik',
    'depends_on_past': False,
    'start_date': datetime(2019, 4, 11, 10, 21, 23),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# Creating the first operator, which will print to the console.
def print_operator_one():
    log.info("Operator One is executed.")
    return "Operator One is executed and returned"

# Creating the third operator, which will print to the console.
def print_operator_third():
    log.info("Operator three is executed")
    return "Operator three is executed and returned"

# Creating DAG
dag = DAG('custom_dag', default_args=defaultArgs, schedule_interval=timedelta(minutes=10))

# Creating task 1
operator_one_task = PythonOperator(task_id="task_1", python_callable="print_operator_one", dag=dag)

# Creating task 2
operator_two_task = InsertDb(my_operator_param="This is custom Operator", task_id="task_2", dag=dag)

# Creating task 3
sensor_one_task = DbSensor(task_id="task_3", poke_interval=10, dag=dag, collection="demoCollection", query={"key1": "value1"})

# Creating task 4
operator_three_task = PythonOperator(task_id="task_4", python_callable="print_operator_third", dag=dag)

# Creating the flow
operator_one_task >> operator_two_task >> sensor_one_task >> operator_three_task
```
home/autotest/airflow/plugins/custom_operator.py
import logging
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.mongo_hook import MongoHook
from airflow.operators.sensors import BaseSensorOperator
from datetime import datetime

log = logging.getLogger(__name__)

class InsertDb(BaseOperator):
    @apply_defaults
    def __init__(self, my_operator_param, *args, **kwargs):
        self.operator_param = my_operator_param
        super(InsertDb, self).__init__(*args, **kwargs)

    def execute(self, context):
        log.info("Inserting into the DB!")
        db_hook = MongoHook(self, conn_id="https://localhost,localhost:27017/mydb")
        db_conn = db_hook.get_conn()
        insertSuccess = db_conn.insert_one(mongo_collection="demoCollection", doc={"key1": "value1"}, mongo_db="mydb")
        log.info(insertSuccess)

class DbSensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, collection, query, mongo_conn_id="mongo_default", *args, **kwargs):
        self.collection = collection
        self.query = query
        super(DbSensor, self).__init__(*args, **kwargs)

    def poke(self, context):
        db_hook = MongoHook(self, conn_id="https://localhost,localhost:27017/mydb")
        db_conn = db_hook.get_conn()
        result = db_conn.find(mongo_collection=self.collection, query=self.query, mongodb="mydb")
        if result is None:
            log.info("Data not available in DB")
            return False
        else:
            log.info("Data is available in DB")
            return True

class DbPlugin(AirflowPlugin):
    name = "db_plugin"
    operators = [InsertDb, DbSensor]
I am not able to launch the webserver. I am getting the following error:
[2019-04-12 12:35:16,046] {models.py:377} ERROR - Failed to import: /home/autotest/airflow/dags/custom_dag1.py
Traceback (most recent call last):
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/models.py", line 374, in process_file
m = imp.load_source(mod_name, filepath)
File "/home/autotest/virtualenv/airflow/lib/python3.6/imp.py", line 172, in load_source
module = _load(spec)
File "<frozen importlib._bootstrap>", line 684, in _load
File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 678, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/autotest/airflow/dags/custom_dag1.py", line 41, in <module>
operator_one_task = PythonOperator(task_id="task_1",python_callable="print_operator_one", dag=dag)
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/utils/decorators.py", line 98, in wrapper
result = func(*args, **kwargs)
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/operators/python_operator.py", line 81, in __init__
raise AirflowException('`python_callable` param must be callable')
airflow.exceptions.AirflowException: `python_callable` param must be callable
Do it without the quotes: python_callable=print_operator_third (and likewise python_callable=print_operator_one). This way you are passing the callable itself instead of a string.
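Applied to the DAG above, only the two PythonOperator lines in custom_dag1.py need to change (a sketch of the fix, not code from the answer itself):

```
# Pass the functions themselves, not their names as strings.
operator_one_task = PythonOperator(task_id="task_1", python_callable=print_operator_one, dag=dag)
operator_three_task = PythonOperator(task_id="task_4", python_callable=print_operator_third, dag=dag)
```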

How to pass deep learning model data to map function in Spark

I have a very simple use case where I read a large number of images from S3 as an RDD using the sc.binaryFiles method. Once this RDD is created, I pass its contents to the VGG16 feature-extractor function. The feature extraction needs the model data, so I put the model data into a broadcast variable and then access its value in each map function. Below is the code:
s3_files_rdd = sc.binaryFiles(RESOLVED_IMAGE_PATH)
s3_files_rdd.persist()
model_data = initVGG16()
broadcast_model = sc.broadcast(model_data)
features_rdd = s3_files_rdd.mapPartitions(extract_features_)
response_rdd = features_rdd.map(lambda x: (x[0], write_to_s3(x, OUTPUT, FORMAT_NAME)))
extract_features_ method:
def extract_features_(xs):
    model_data = initVGG16()
    for k, v in xs:
        yield k, extract_features2(model_data, v)
extract_features method:
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.models import Model
from io import BytesIO
from keras.applications.vgg16 import preprocess_input

def extract_features(model, obj):
    try:
        print('executing vgg16 feature extractor...')
        img = image.load_img(BytesIO(obj), target_size=(224, 224, 3))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        vgg16_feature = model.predict(img_data)[0]
        print('++++++++++++++++++++++++++++', vgg16_feature.shape)
        return vgg16_feature
    except Exception as e:
        print('Error......{}'.format(e.args))
        return []
write_to_s3 method:
def write_to_s3(rdd, output_path, format_name):
    file_path = rdd[0]
    file_name_without_ext = get_file_name_without_ext(file_path)
    bucket_name = output_path.split('/', 1)[0]
    final_path = 'deepak' + '/' + file_name_without_ext + '.' + format_name
    LOGGER.info("Saving to S3....")
    cci = cc.get_interface(bucket_name, ACCESS_KEY=os.environ.get("AWS_ACCESS_KEY_ID"),
                           SECRET_KEY=os.environ.get("AWS_SECRET_ACCESS_KEY"), endpoint_url='https://s3.amazonaws.com')
    response = cci.upload_npy_array(final_path, rdd[1])
    return response
Inside the write_to_s3 method I receive each RDD record, extract the key name and bucket to save to, and then use a library called cottoncandy to directly save the RDD content (a numpy array in my case) without writing any intermediate file.
I am getting the error below:
127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 600, in save_reduce
save(state)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
Traceback (most recent call last):
File "one_file5.py", line 98, in <module>
run()
File "one_file5.py", line 89, in run
LOGGER.info('features_rdd rdd created,...... %s',features_rdd.count())
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 1041, in count
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 1032, in sum
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 906, in fold
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 809, in collect
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 2455, in _jrdd
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 2388, in _wrap_function
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/rdd.py", line 2374, in _prepare_for_python_RDD
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/serializers.py", line 464, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 704, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541576150127_0010/container_1541576150127_0010_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 162, in dump
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects.
When I comment out the features_rdd part, the program runs fine, which means something is wrong in the features_rdd part. I am not sure what I am doing wrong here.
I am running the program in AWS EMR, with 4 executors.
executor core 7
executor RAM 8GB
Spark version 2.2.1
Replace your current code with mapPartitions and create the model inside each partition; that way the Keras model, which contains unpicklable thread locks, never has to be serialized and shipped from the driver:
def extract_features_(xs):
    model_data = initVGG16()
    for k, v in xs:
        yield k, extract_features(model_data, v)

features_rdd = s3_files_rdd.mapPartitions(extract_features_)
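If re-initialising VGG16 in every partition turns out to be too slow, one common alternative (my suggestion, not part of the answer above, and assuming initVGG16 builds a standard Keras VGG16; the include_top=False choice here is mine) is to broadcast only the weights, which are plain numpy arrays and pickle fine, and rebuild the architecture on each executor:

```
# Broadcast the weights instead of the model object itself.
from keras.applications.vgg16 import VGG16

model = VGG16(weights="imagenet", include_top=False)   # built once on the driver
bc_weights = sc.broadcast(model.get_weights())          # lists of numpy arrays pickle fine

def extract_features_(xs):
    # Rebuild the architecture on the executor and load the broadcast weights.
    local_model = VGG16(weights=None, include_top=False)
    local_model.set_weights(bc_weights.value)
    for k, v in xs:
        yield k, extract_features(local_model, v)

features_rdd = s3_files_rdd.mapPartitions(extract_features_)
```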
