pandas_udf in PySpark 3.0.0 raises an unexpected error - python-3.x

I followed the example on the Spark website, but it fails. My code is below:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.pandas.functions import pandas_udf

class SparkBase(object):
    def __init__(self, master="local[*]", app_name="SparkBase"):
        _conf = SparkConf().setMaster(master).setAppName(app_name)
        _conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
        _conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", True)
        self.sc = SparkContext().getOrCreate(conf=_conf)
        self.spark = SparkSession.builder.config(conf=_conf).enableHiveSupport().getOrCreate()

@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3["col2"] = s1 + s2.str.len()
    return s3

if __name__ == "__main__":
    spark_base = SparkBase()
    df = spark_base.spark.createDataFrame([[1, "a string", ("a nested string",)]],
                                          "long_c long, str_c string, struct_c struct<col1: string>")
    df.show()
The error traceback:
Traceback (most recent call last):
File "F:/otherproj/localpyspark/pyspark3/sparkbase.py", line 24, in <module>
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\pandas\functions.py", line 426, in _create_pandas_udf
return _create_udf(f, returnType, evalType)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 43, in _create_udf
return udf_obj._wrapped()
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 204, in _wrapped
wrapper.returnType = self.returnType
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\udf.py", line 94, in returnType
self._returnType_placeholder = _parse_datatype_string(self._returnType)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 822, in _parse_datatype_string
raise e
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 812, in _parse_datatype_string
return from_ddl_schema(s)
File "D:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\types.py", line 804, in from_ddl_schema
sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json())
AttributeError: 'NoneType' object has no attribute '_jvm'
If I comment out the func function (and its decorator), the script runs successfully. What is wrong with it? Is it a bug in Spark 3.0.0?
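For context on the traceback: the DDL string given to @pandas_udf ("col1 string, col2 long") is parsed via the JVM, so an active SparkContext must already exist when the decorator runs. In the script above the decorator executes at import time, before SparkBase() is ever constructed, which is why _parse_datatype_string sees no active context and fails with 'NoneType' object has no attribute '_jvm'. A minimal sketch of the reordering that avoids this (a sketch, not an official fix; it requires pyarrow, as pandas_udf always does, and adds a select so the UDF actually runs):
```python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.pandas.functions import pandas_udf

# Create the session (and therefore the SparkContext) first ...
spark = SparkSession.builder.master("local[*]").appName("SparkBase").getOrCreate()

# ... and only then define the decorated UDF, so the DDL return type can be parsed.
@pandas_udf("col1 string, col2 long")
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    s3["col2"] = s1 + s2.str.len()
    return s3

df = spark.createDataFrame([[1, "a string", ("a nested string",)]],
                           "long_c long, str_c string, struct_c struct<col1: string>")
df.select(func("long_c", "str_c", "struct_c")).show()
```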

Related

Facing an issue integrating code with AWS Glue, Ray, and PySpark

I am facing the following exception. I have tried various ways but it is not resolved.
It occurs during parallel distributed processing with the Ray library: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
Traceback (most recent call last):
File "etl_engine_ray.py", line 148, in <module>
print(perform_etl(**requested_data))
File "etl_engine_ray.py", line 138, in perform_etl
futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
File "etl_engine_ray.py", line 138, in <listcomp>
futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/remote_function.py", line 124, in _remote_proxy
return self._remote(args=args, kwargs=kwargs)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 295, in _invocation_remote_span
return method(self, args, kwargs, *_args, **_kwargs)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/remote_function.py", line 263, in _remote
self._pickled_function = pickle.dumps(self._function)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/glue_user/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 620, in dump
return Pickler.dump(self, obj)
File "/home/glue_user/spark/python/pyspark/context.py", line 362, in __getnewargs__
"It appears that you are attempting to reference SparkContext from a broadcast "
Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
import os
import time

from pyspark import SparkContext
from awsglue.context import GlueContext
from awsglue.transforms import SelectFields
import ray
import settings

sc = SparkContext.getOrCreate()
glue_context = GlueContext(sc)

@ray.remote
def process_etl(path: str, uid: str, integration_id: str, time_stamp: int):
    try:
        dynamic_df = glue_context.create_dynamic_frame_from_options(
            connection_type=settings.CONNECTION_TYPE,
            connection_options={
                'paths': [path],
                'recurse': True,
                'groupFiles': settings.S3_GROUP_FILES,
                'groupSize': settings.S3_GROUP_SIZE},
            format='json',
            format_options={"jsonPath": "*"}
        )
        # select only the required column names
        selected_data = SelectFields.apply(
            frame=dynamic_df,
            paths=['partner_script_id', 'details', 'registered_installation_id', 'type']
        )
        # Create file format
        file_name = os.path.basename(path).split('.')[0]
        parquet_path = f'{settings.S3_BUCKET_PATH}/{integration_id}/{uid}/{time_stamp}/{file_name}.parquet'
        # If a custom pipeline is available then use it
        if file_name in settings.CUSTOM_ETL_PIPELINE:
            selected_data = settings.CUSTOM_ETL_PIPELINE.get(file_name)(selected_data)
        # Write data into the bucket in parquet format
        glue_context.write_dynamic_frame_from_options(
            selected_data,
            connection_type=settings.CONNECTION_TYPE,
            connection_options={'path': parquet_path},
            format='parquet',
            format_options={
                "compression": "snappy",
                'blockSize': settings.BLOCK_SIZE,
                'pageSize': settings.PAGE_SIZE}
        )
    except Exception as error:
        print(f'Exception in perform_etl is {error}')
    return parquet_path

def perform_etl(uid: str, integration_id: str, type: str, data: list) -> dict:
    time_stamp = int(time.time())
    futures = [process_etl.remote(each, uid, integration_id, time_stamp) for each in data]
    # a = sc.parallelize(data)
    # d = a.map(lambda each: process_etl.remote(each, uid, integration_id, time_stamp)).collect()
    # print(d)
    final_data = ray.get(futures)
    print(time.time() - start_time)
    return final_data

if __name__ == '__main__':
    print(perform_etl(**requested_data))
I have done a lot of R&D but still have not found the root cause. It is not resolved; please help me out with this.
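One observation from the traceback: Ray's cloudpickle serializes process_etl together with the globals it references, and the function references glue_context, which holds the driver's SparkContext; pickling that context is exactly what the SPARK-5063 guard forbids. A hedged sketch of one possible restructuring (an assumption, not a verified Glue-on-Ray recipe) is to build the contexts lazily inside the remote task so the pickled function references nothing Spark-related:
```python
import ray

@ray.remote
def process_etl(path: str, uid: str, integration_id: str, time_stamp: int):
    # Import and build the contexts inside the task, so Ray pickles only a
    # plain function. Whether GlueContext can actually be constructed inside
    # a Ray worker process depends on the Glue environment - this is a sketch.
    from pyspark import SparkContext
    from awsglue.context import GlueContext

    sc = SparkContext.getOrCreate()
    glue_context = GlueContext(sc)
    # ... same ETL body as above, using this local glue_context ...
    return path
```
The simpler alternative is to keep all Glue/Spark calls on the driver and hand Ray tasks only plain, serializable arguments and pure-Python work.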

python3.8 multiprocessing Pool Can't pickle function: attribute lookup getExcelData on __main__ failed when using pandas Dataframe

I am trying to read CSV files into pandas data frames using multiprocessing, but I get a pickling error.
python 3.8.8
pandas 1.2.4
import os
import pandas as pd
import time
from multiprocessing import Pool

def getExcelData(fn):
    data = pd.DataFrame()
    return data.append(pd.read_csv(fn), sort=False)

if __name__ == "__main__":
    dir = '.'
    fn_ls = [f'{fn}' for fn in os.listdir(dir) if fn.endswith('test.csv')]
    startTime = time.time()
    pool = Pool(2)
    pool_data_list = []
    data = pd.DataFrame()
    for file_name in fn_ls:
        pool_data_list.append(pool.apply_async(getExcelData, (os.path.join(dir, file_name),)))
    pool.close()
    pool.join()
    for pool_data in pool_data_list:
        data = data.append(pool_data.get())
    res_ls = []
    for pool_data in pool_data_list:
        res_ls = pool_data.get()
    endTime = time.time()
    print(endTime - startTime)
    print(len(data))
Traceback (most recent call last):
File "/Users/cxx/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "", line 1, in
runfile('/Users/cxx/xiaoxi/18_Mercury/raw_data/raw/5000bp/test/test.py', wdir='/Users/cxx/xiaoxi/18_Mercury/raw_data/raw/5000bp/test')
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_bundle/pydev_umd.py", line 198, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/cxx/xiaoxi/18_Mercury/raw_data/raw/5000bp/test/test.py", line 33, in
data = data.append(pool_data.get())
File "/Users/cxx/opt/anaconda3/lib/python3.8/multiprocessing/pool.py", line 771, in get
raise self._value
File "/Users/cxx/opt/anaconda3/lib/python3.8/multiprocessing/pool.py", line 537, in _handle_tasks
put(task)
File "/Users/cxx/opt/anaconda3/lib/python3.8/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/Users/cxx/opt/anaconda3/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function getExcelData at 0x7f84e9ad19d0>: attribute lookup getExcelData on __main__ failed
Replace everything between startTime and endTime with a simple map call in a context manager:
with Pool(2) as pool:
    data = [df for df in pool.imap(getExcelData, fn_ls)]
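Two further points, hedged: the traceback shows the script was launched through PyCharm's runfile console, and the pickler sends functions by reference, so getExcelData apparently cannot be looked up as an attribute of __main__ there (which is exactly what the error says); running the file directly with python test.py avoids that part of the failure. And pd.concat is the usual way to combine the per-file frames. A sketch of the whole pattern under those assumptions:
```python
# Hedged sketch: the same job run as a plain script (python test.py), with the
# per-file frames combined via pd.concat instead of repeated DataFrame.append.
import os
import time
import pandas as pd
from multiprocessing import Pool

def getExcelData(fn):
    return pd.read_csv(fn)

if __name__ == "__main__":
    dir = '.'
    fn_ls = [os.path.join(dir, fn) for fn in os.listdir(dir) if fn.endswith('test.csv')]
    startTime = time.time()
    with Pool(2) as pool:
        data = pd.concat(pool.imap(getExcelData, fn_ls), ignore_index=True)
    print(time.time() - startTime)
    print(len(data))
```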

Using Faker with PySpark Dataframe to Anonymise Data

I am trying to change a few columns in my Spark DataFrame. I have columns like:
First Name
Last Name
Email
I want to anonymise these and generate meaningful values, for which I am using Faker.
But if I use
df.withColumn('FirstName', lit(fake.first_name()))
it adds the same name for all rows: every row ends up with the same first name. Ideally I would like a different Faker value for each row, not a constant. How would I achieve this?
Update 1:
I looked at Steven's suggestion and here is my updated code:
import logging
import pyspark.sql.functions as sf
from faker import Faker
from pyspark.sql import functions as F
MSG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(format=MSG_FORMAT, datefmt=DATETIME_FORMAT)
logger = logging.getLogger("[SFDC-GLUE-LOG]")
fake = Faker()
source_df = spark.read.format("jdbc").option("url",connection_url).option("query",query).option("driver", driver_name).option("user", user_name).option("password", password).option("StmtCallLimit",0).load()
fake_firstname = F.udf(fake.first_name)
masked_df=source_df.withColumn("FirstName", fake_firstname())
Now I get:
Traceback (most recent call last):
File "script_2020-08-05-17-15-26.py", line 52, in <module>
masked_df=source_df.withColumn("FirstName", fake_firstname())
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 189, in wrapper
return self(*args)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 167, in __call__
judf = self._judf
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 151, in _judf
self._judf_placeholder = self._create_judf()
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 160, in _create_judf
wrapped_func = _wrap_function(sc, self.func, self.returnType)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/sql/udf.py", line 35, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/rdd.py", line 2420, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/mnt/yarn/usercache/root/appcache/application_1596647211940_0002/container_1596647211940_0002_01_000001/pyspark.zip/pyspark/serializers.py", line 600, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: TypeError: can't pickle weakref objects
You need to use a UDF for that:
from pyspark.sql import functions as F
fake_firstname = F.udf(fake.first_name)
df.withColumn("FirstName", fake_firstname())
I had the same problem; here is the solution that worked for me:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from faker import Factory

def fake_name():
    faker = Factory.create()
    return faker.name()

fake_name_udf = udf(fake_name, StringType())
df = df.withColumn('name', fake_name_udf())
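Both answers work for the same underlying reason the update above failed: F.udf(fake.first_name) asks Spark to pickle a bound method of a Faker instance, which apparently carries unpicklable internals (hence the weakref error), while a plain function that builds its Faker on the worker pickles cleanly. A hedged variant of the second answer that avoids re-creating the Faker object on every row (a hypothetical optimisation, not part of the original answers):
```python
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from faker import Faker

_faker = None  # stays None on the driver; populated lazily on each executor

def fake_first_name():
    global _faker
    if _faker is None:
        _faker = Faker()          # created worker-side, never pickled
    return _faker.first_name()

fake_first_name_udf = udf(fake_first_name, StringType())
# df = df.withColumn('FirstName', fake_first_name_udf())
```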

Unable to import the CustomOperator defined in a python script in plugins folder

I am trying to write a custom operator and sensor in apache-airflow.
It basically has 3 operators and 1 sensor. The first operator/task calls a Python method and prints a message to the console. After that, the 2nd operator is called; it is a custom operator placed inside the plugins folder, in a file named "custom_operator.py", and it inserts data into a MongoDB database. Then the custom sensor is called, which uses mongo_hook to monitor the DB and check for the value; it lives in the same custom_operator.py file inside plugins. After this, a simple PythonOperator is called.
I have already tried:
Can't import Airflow plugins
home/autotest/airflow/dags/custom_dag1.py
```
import logging
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from airflow.operators import InsertDb
from airflow.operators import DbSensor

log = logging.getLogger(__name__)

defaultArgs = {
    'owner': 'mohit_saumik',
    'depends_on_past': False,
    'start_date': datetime(2019, 4, 11, 10, 21, 23),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# creating first operator which will print on the console.
def print_operator_one():
    log.info("Operator One is executed.")
    return "Operator One is executed and returned"

# Creating third operator which will print on the console.
def print_operator_third():
    log.info("Operator three is executed")
    return "Operator two is executed and returned"

# Creating DAG
dag = DAG('custom_dag', default_args=defaultArgs, schedule_interval=timedelta(minutes=10))

# Creating task 1
operator_one_task = PythonOperator(task_id="task_1", python_callable="print_operator_one", dag=dag)

# Creating task 2
operator_two_task = InsertDb(my_operator_param="This is custom Operator", task_id="task_2", dag=dag)

# Creating Task 3
sensor_one_task = DbSensor(task_id="task_3", poke_interval=10, dag=dag, collection="demoCollection", query={"key1": "value1"})

# Creating task 4
operator_three_task = PythonOperator(task_id="task_4", python_callable="print_operator_third", dag=dag)

# Creating flow
operator_one_task >> operator_two_task >> sensor_one_task >> operator_three_task
```
home/autotest/airflow/plugins/custom_operator.py
```
import logging
from airflow.models import BaseOperator
from airflow.plugins_manager import AirflowPlugin
from airflow.utils.decorators import apply_defaults
from airflow.contrib.hooks.mongo_hook import MongoHook
from airflow.operators.sensors import BaseSensorOperator
from datetime import datetime

log = logging.getLogger(__name__)

class InsertDb(BaseOperator):
    @apply_defaults
    def __init__(self, my_operator_param, *args, **kwargs):
        self.operator_param = my_operator_param
        super(InsertDb, self).__init__(*args, **kwargs)

    def execute(self, context):
        log.info("Inserting into the DB!")
        db_hook = MongoHook(self, conn_id="https://localhost,localhost:27017/mydb")
        db_conn = db_hook.get_conn()
        insertSuccess = db_conn.insert_one(mongo_collection="demoCollection", doc={"key1": "value1"}, mongo_db="mydb")
        log.info(insertSuccess)

class DbSensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, collection, query, mongo_conn_id="mongo_default", *args, **kwargs):
        super(DbSensor, self).__init__(*args, **kwargs)

    def poke(self, context):
        db_hook = MongoHook(self, conn_id="https://localhost,localhost:27017/mydb")
        db_conn = db_hook.get_conn()
        result = db_conn.find(mongo_collection=collection, query=query, mongodb="mydb")
        if result is None:
            log.info("Data not available in DB")
            return False
        else:
            log.info("Data is available in DB")
            return True

class DbPlugin(AirflowPlugin):
    name = "db_plugin"
    operators = [InsertDb, DbSensor]
```
I am not able to launch the webserver. I get the following errors:
[2019-04-12 12:35:16,046] {models.py:377} ERROR - Failed to import: /home/autotest/airflow/dags/custom_dag1.py
Traceback (most recent call last):
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/models.py", line 374, in process_file
m = imp.load_source(mod_name, filepath)
File "/home/autotest/virtualenv/airflow/lib/python3.6/imp.py", line 172, in load_source
module = _load(spec)
File "<frozen importlib._bootstrap>", line 684, in _load
File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 678, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/autotest/airflow/dags/custom_dag1.py", line 41, in <module>
operator_one_task = PythonOperator(task_id="task_1",python_callable="print_operator_one", dag=dag)
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/utils/decorators.py", line 98, in wrapper
result = func(*args, **kwargs)
File "/home/autotest/virtualenv/airflow/lib/python3.6/site-packages/airflow/operators/python_operator.py", line 81, in __init__
raise AirflowException('`python_callable` param must be callable')
airflow.exceptions.AirflowException: `python_callable` param must be callable
Do it without the quotes: python_callable=print_operator_third. This way you are passing a callable instead of a string.
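Applied to the DAG above, only the python_callable arguments change; a sketch of the corrected task definitions:
```python
# In custom_dag1.py, with the same imports and dag as above:
# pass the function objects themselves, not their names as strings.
operator_one_task = PythonOperator(task_id="task_1", python_callable=print_operator_one, dag=dag)
operator_three_task = PythonOperator(task_id="task_4", python_callable=print_operator_third, dag=dag)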

Pyspark: Using a lambda function with .withColumn produces a NoneType error I'm having trouble understanding

I have the following code below. Essentially, what I'm trying to do is generate some new columns from the values in existing ones. After that, I save the dataframe with the new columns as a table in the cluster. Sorry, I'm still new to pyspark.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.sql.functions import udf, array
from pyspark.sql.types import DecimalType
import numpy as np
import math
df = sqlContext.sql('select * from db.mytable')
angle_av = udf(lambda (x, y): -10 if x == 0 else math.atan2(y/x)*180/np.pi, DecimalType(20,10))
df = df.withColumn('a_v_angle', angle_av(array('a_v_real', 'a_v_imag')))
df.createOrReplaceTempView('temp')
sqlContext.sql('create table new_table as select * from temp')
These operations don't produce any errors by themselves. I then attempt to store the df as a table and get the following error (I'm guessing because this is when the operations are actually executed):
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 171, in main
process()
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 166, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 103, in <lambda>
func = lambda _, it: map(mapper, it)
File "<string>", line 1, in <lambda>
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<stdin>", line 14, in <lambda>
TypeError: unsupported operand type(s) for /: 'NoneType' and 'NoneType'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
This happens because the input values are null / None. The function should check its inputs and proceed accordingly, e.g.
if x == 0 or x is None
or just
if not x
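A hedged sketch of what the guarded UDF could look like (assumptions: the single-argument math.atan2(y/x) in the question was meant as the two-argument math.atan2(y, x), and the result is returned as a Decimal quantized to the scale of DecimalType(20,10)):
```python
from decimal import Decimal
import math

from pyspark.sql.functions import udf, array
from pyspark.sql.types import DecimalType

def angle(v):
    x, y = v[0], v[1]
    # Guard against nulls as well as the zero case before calling atan2.
    if x is None or y is None or x == 0:
        return Decimal(-10)
    return Decimal(math.atan2(y, x) * 180 / math.pi).quantize(Decimal('1.0000000000'))

angle_av = udf(angle, DecimalType(20, 10))
# df = df.withColumn('a_v_angle', angle_av(array('a_v_real', 'a_v_imag')))
```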
