How to call another object within Pyspark UDF function - apache-spark

I have a class Hello with a few methods.
I would like to create a Hello object within a PySpark UDF, such as:
def foo_generation(query_params):
    query_obj = Hello()
    foo = query_obj.hello_method(query_params)
    return foo

spark.udf.register("foo_generation", foo_generation)
df = df.withColumn("foo", F.expr("foo_generation(query_param_ES)"))
This doesn't appear to be working.
How should I generate a Hello object in this instance?

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

@udf(returnType=StringType())
def foo_generation(str):
    query_obj = Hello()
    foo = query_obj.hello_method(str)
    return foo

df = df.withColumn("foo", F.expr("foo_generation(str)"))
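One pattern that usually works (a sketch, not a drop-in fix: the module name hello_module is a hypothetical stand-in) is to make sure Hello is importable on the executors, construct it inside the UDF body, and register the UDF with an explicit return type:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Hypothetical module name: Hello must be importable on the workers
# (defined in the job script itself or shipped via --py-files).
from hello_module import Hello

def foo_generation(query_params):
    # The Hello object is constructed on the executor, inside the UDF body.
    query_obj = Hello()
    return query_obj.hello_method(query_params)

# Register with an explicit return type so Spark knows the output schema.
spark.udf.register("foo_generation", foo_generation, StringType())
df = df.withColumn("foo", F.expr("foo_generation(query_param_ES)"))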

Related

pyspark called from an imported method that calls another method gives empty dataframe

I have a module called test_pyspark_module with following code:
class SparkTest:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        spark = self.spark
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
        <TABLENAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
        and app_id=1233
        """
        return self.fetch_data(sql_text)

def fetch_data(sql, sql_text):
    data = sql(sql_text).toPandas()
    print(len(data))
I have a PySpark kernel running and the following code in my Jupyter notebook:
from pyspark.sql import SQLContext
from pyspark import SparkContext

sqlContext = SQLContext(spark)
sql = sqlContext.sql

sql_text = """
SELECT *
FROM
<TABLENAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
and app_id=1233
"""

from test_pyspark_module import *
st = SparkTest(spark, sql)
Now when I run st.fetch_data(sql_text) I get 43000. However, when I run st.call_fetch_data() I get 0.
I wanted to see if something was going wrong with the import, so I implemented a local duplicate of SparkTest, calling it SparkTest2. This works as I expect, with both functions reporting 43000.
class SparkTest2:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
        <TABLE_NAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
        and app_id=1233
        """
        return self.fetch_data(sql_text)

st2 = SparkTest2(spark, sql)
Both st2.fetch_data(sql_text) and st2.call_fetch_data() give output 43000.
So it seems that when a class method that calls another method is run from an imported class, it fails to give correct results. Note that there is no error or exception; I just get 0 rows (with the correct number of columns, i.e. 28).
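One way to narrow this down (a debugging sketch, not a fix; this variant of fetch_data is hypothetical) is to print the exact SQL text each path sends to Spark and count rows before converting to pandas, to confirm whether the query embedded in the imported module really matches the one defined in the notebook:

# Hypothetical variant of SparkTest.fetch_data, for debugging only.
def fetch_data(self, sql_text):
    print(repr(sql_text))        # the query exactly as the cluster receives it
    df = self.sql(sql_text)
    print(df.count())            # row count on the Spark side, before toPandas()
    return df.toPandas()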

Error while calling transform function using object of class

I have just started learning the spark-specific functions in python. I am writing one program that reads, transforms, and writes the data. I am using the transform function to do the transformation.
import logging
import sys
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

class Ingest:
    def __init__(self):
        self.spark_session = (
            SparkSession.builder.master("cluster")
            .appName("IngestData")
            .getOrCreate()
        )
        self.input_directory = "/mnt/input"
        self.output_directory = "/mnt/output"

    def applyLogic(self, df):
        df.cache()
        listValues = ["X", "Y"]
        df = df.withColumn(
            "name",
            when(
                (df["Title"].contains("Gemini"))
                & (df["Title"].contains("Cancer")),
                "Intelligent",
            ),
        )
        return df
obj = Ingest()
transformedDF = obj.transform(applyLogic)
I get an error when using the transform function. Any pointers on how to call transform with a class object, or am I doing something wrong? I am able to run the logic without using the transform function.
I was able to resolve it after making the below change
obj = Ingest()
transformedDF = df.transform(obj.applyLogic)
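For context, DataFrame.transform (Spark 3.0+) expects a function that takes a DataFrame and returns a DataFrame, which is why passing the bound method obj.applyLogic to df.transform works. A minimal sketch, with toy data and the Title column assumed from the question:

from pyspark.sql import SparkSession
from pyspark.sql.functions import when

class Ingest:
    def applyLogic(self, df):
        # The function given to DataFrame.transform must accept and return a DataFrame.
        return df.withColumn(
            "name",
            when(df["Title"].contains("Gemini") & df["Title"].contains("Cancer"),
                 "Intelligent"),
        )

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Gemini Cancer",), ("Other",)], ["Title"])
transformedDF = df.transform(Ingest().applyLogic)  # equivalent to Ingest().applyLogic(df)
transformedDF.show()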

Pandas DataFrame Accessor Type Hints

Pandas allows you to extend its DataFrame class by using the pd.api.extensions.register_dataframe_accessor() decorator.
While this is functional, it doesn't offer any additional type hinting capabilities.
For example, I would expect the following to type check OK and even provide type hints
import pandas as pd

@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    def __init__(self, df: pd.DataFrame):
        self._df = df

    def foo(self, bar) -> str:
        return "foobar"

foo = pd.DataFrame({"foo": ["bar"]})
foo.dataset.foo("bar")
#   ^ the IDE shows "No Suggestions" here
How can I get dataframe accessors to provide autocomplete?
This can be done somewhat hackishly using typing.TYPE_CHECKING and a bit of inheritance.
from typing import TYPE_CHECKING
import pandas as pd

@pd.api.extensions.register_dataframe_accessor('dataset')
class Extension:
    def __init__(self, df: pd.DataFrame):
        self._df = df

    def foo(self, bar) -> str:
        return "foobar"

if TYPE_CHECKING:
    class DataFrame(pd.DataFrame):
        dataset: Extension

foo: 'DataFrame' = pd.DataFrame({"foo": ["bar"]})
# ^ you have to do this every time you transform the DataFrame
foo.dataset.foo("bar")
# ^ autocomplete is now provided
Unfortunately, PyCharm does not check the __annotations__ dictionary, or really do any dynamic type checking, so there doesn't appear to be a more universal solution.

Error when calling Dask groupby with custom aggregate

The following error occurs when running the example test below. What am I doing wrong?
Error: Exception('Column(s) id already selected',)
Code:
import unittest
import dask
import numpy as np
import pandas as pd
import dask.dataframe as dd

class TestDaskCustomAgg(unittest.TestCase):
    def mode(self, x):
        val = pd.Series.mode(x)
        if val.empty:
            return np.nan
        return val[0]

    def test_get_transactions(self):
        df = dask.datasets.timeseries()
        custom_agg = dd.Aggregation('custom_agg', agg=lambda x: self.mode(x), chunk=lambda x0: self.mode(x0))
        df.groupby('name').agg(custom_agg).compute()
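For reference, the chunk and agg callables of a dd.Aggregation receive grouped series (SeriesGroupBy) objects rather than plain Series, so a per-group reduction has to go through the groupby API. A rough sketch along the lines of the test above (it takes the mode of per-partition modes, which approximates but does not guarantee the global mode, and aggregates a single column):

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

def series_mode(s):
    # Mode of a plain pandas Series; NaN when the group is empty.
    val = pd.Series.mode(s)
    return val.iloc[0] if not val.empty else np.nan

# chunk and agg each receive a grouped series, so reduce per group with .apply
custom_agg = dd.Aggregation(
    'custom_agg',
    chunk=lambda grouped: grouped.apply(series_mode),
    agg=lambda grouped: grouped.apply(series_mode),
)

df = dask.datasets.timeseries()
result = df.groupby('name').x.agg(custom_agg).compute()  # aggregate one column
print(result.head())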

Pyspark S3A Access Denied Exception for cross account STS assume role

I set up an AWS Glue job to process S3 files present in another AWS account B. The IAM role in Account A (the Glue job's IAM role) uses STS to assume a role in Account B, which provides access to my desired files. Account B's IAM role has a trust relationship with the Glue job role in Account A. I was able to print the access key and secret key, so I assume STS is working well.
I get the error below:
An error occurred while calling o83.json. com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied;
What is the right way to set up the S3A connector, given that I get an Access Denied exception?
Here is my code:
from __future__ import print_function
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql import HiveContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import explode_outer
from pyspark.sql.functions import substring_index
from pyspark.sql.functions import input_file_name
from pyspark.sql import functions as f
import sys
import os
import boto3
import errno
import time
import datetime
from datetime import timedelta, date
from pyspark.sql.functions import split
from pyspark.sql.functions import substring
from boto3.session import Session

spark = SparkSession\
    .builder\
    .appName("JsonInputFormat")\
    .enableHiveSupport()\
    .getOrCreate()

sc = spark.sparkContext
hive_context = HiveContext(sc)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_context.setConf("hive.serialization.extend.nesting.levels", "true")
sqlCtx = HiveContext(sc)

client = boto3.client('sts')
response = client.assume_role(RoleArn='ROLE_TO_ASSUME', RoleSessionName='AssumeRoleSession1')
credentials = response['Credentials']
ACCESS_KEY = credentials['AccessKeyId']
SECRET_KEY = credentials['SecretAccessKey']
print('access key is {}'.format(ACCESS_KEY))
print('secret key is {}'.format(SECRET_KEY))
print("Hadoop version: " + sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())

session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')

spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET_KEY)
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
spark._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
def flatten_schema(schema):
    """Take schema as returned from schema().jsonValue()
    and return list of field names with full path"""
    def _flatten(schema, path="", accum=None):
        # Extract name of the current element
        name = schema.get("name")
        # If there is a name extend path
        if name is not None:
            path = "{0}.{1}".format(path, name) if path else name
        print('path is {}'.format(path))
        # It is some kind of struct
        if isinstance(schema.get("fields"), list):
            for field in schema.get("fields"):
                _flatten(field, path, accum)
        elif isinstance(schema.get("type"), dict):
            _flatten(schema.get("type"), path, accum)
        # It is an atomic type
        else:
            accum.append(path)
    accum = []
    _flatten(schema, "", accum)
    return accum
sqlCtx.sql("set spark.sql.caseSensitive=true")
yesterday = date.today() - timedelta(1)
daybefore=yesterday.strftime("%Y-%m-%d")
currentdate=time.strftime("%Y-%m-%d")
key = 'KEY={}'.format(str(daybefore))
bucket = 'BUCKET_NAME'
df_base=spark.read.json('s3a://{}/{}/*/'.format(bucket,key))
base_schema=df_base.schema
datePrefix=str(daybefore)
source='s3a://{}/{}'.format(bucket,key)
df1=spark.read.json(source,schema=base_schema)
schema=df1.schema.jsonValue()
columns_list=flatten_schema(schema)
print('columns list is {}'.format(columns_list))
df2 = df1.select(*(col(x).alias(x.replace('.','_')) for x in columns_list))
print('df2 is {}'.format(df2))
df3=df2.select("*",explode_outer(df2.contents).alias("contents_flat"))
df3=df3.drop("contents")
print('df3 is {}'.format(df3))
schema4=df3.schema.jsonValue()
columns_list4=flatten_schema(schema4)
print('columns list 4 is {}'.format(columns_list4))
df5 = df3.select(*(col(x).alias(x.replace('.','_')) for x in columns_list4))
print('df5 is {}'.format(df5))
schema5=df5.schema.jsonValue()
columns_list5=flatten_schema(schema5)
print('columns list 5 is {}'.format(columns_list5))
df6 = df5.select(*(col(x).alias(x.replace('contents_flat','contents')) for x in columns_list5))
print('df6 is {}'.format(df6))
schema6=df6.schema.jsonValue()
columns_list6=flatten_schema(schema6)
print('column list 6 is {}'.format(columns_list6))
df7 = df6.select(*(col(x) for x in columns_list6)) #above line edited down here
schema7=df7.schema.jsonValue()
print('schema7 is {}'.format(schema7))
columns_list7=flatten_schema(schema7)
print('columns list 7 is {}'.format(columns_list7))
df7 = df7.select(*(col(x).alias(x.replace('.','_')) for x in columns_list7))
df7=df7.select("*",explode_outer(df7.business_address_latLng_warnings).alias("business_address_latLng_warnings_flat"))
df7=df7.drop("business_address_latLng_warnings")
print('df7 is {}'.format(df7))
df8 = df7.withColumn("filename",input_file_name())
split_col = split(df8['filename'], 'short_date=')
df9 = df8.withColumn('shortfilename', split_col.getItem(1))
df_final = df9.withColumn('filedate', substring('shortfilename',1,10)).drop('shortfilename')
print('df_final is {}'.format(df_final))
df_final.write.mode('append').csv('s3://{bucket}/{folder1}/', header='true')
spark.stop()
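One thing worth checking (a sketch, assuming the cluster's Hadoop S3A build supports these properties): the credentials returned by assume_role are temporary and include a SessionToken, and S3 rejects the temporary key pair unless requests also carry that token, which S3A only sends when it is configured with the temporary-credentials provider and the session token:

# Temporary STS credentials need the session token alongside the key pair.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.aws.credentials.provider",
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
hadoop_conf.set("fs.s3a.access.key", credentials['AccessKeyId'])
hadoop_conf.set("fs.s3a.secret.key", credentials['SecretAccessKey'])
hadoop_conf.set("fs.s3a.session.token", credentials['SessionToken'])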
