Error while calling transform function using object of class - python-3.x

I have just started learning the Spark-specific functions in Python. I am writing a program that reads, transforms, and writes data, and I am using the transform function to do the transformation.
import logging
import sys
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import when

class Ingest:
    def __init__(self):
        self.spark_session = (
            SparkSession.builder.master("cluster")
            .appName("IngestData")
            .getOrCreate()
        )
        self.input_directory = "/mnt/input"
        self.output_directory = "/mnt/output"

    def applyLogic(self, df):
        df.cache()
        listValues = ["X", "Y"]
        df = df.withColumn(
            "name",
            when(
                (df["Title"].contains("Gemini"))
                & (df["Title"].contains("Cancer")),
                "Intelligent",
            ),
        )
        return df
obj = Ingest()
transformedDF = obj.transform(applyLogic)
I am getting an error while using the transform function. Any pointers on how to call the transform function using a class object, or am I doing something wrong? I am able to run the logic without using the transform function.

I was able to resolve it after making the change below:
obj = Ingest()
transformedDF = df.transform(obj.applyLogic)
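For reference, DataFrame.transform(func) simply calls func on the DataFrame and returns the result, which is why passing the bound method obj.applyLogic works, while obj.transform(applyLogic) fails (the Ingest class has no transform method of its own). Here is a minimal, self-contained sketch of the working pattern, using a local session instead of the cluster setup above:

from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder.master("local[*]").appName("TransformDemo").getOrCreate()

class Ingest:
    def applyLogic(self, df):
        # DataFrame.transform passes the DataFrame in as this argument
        return df.withColumn(
            "name",
            when(
                df["Title"].contains("Gemini") & df["Title"].contains("Cancer"),
                "Intelligent",
            ),
        )

df = spark.createDataFrame([("Gemini Cancer",), ("Leo",)], ["Title"])

obj = Ingest()
transformedDF = df.transform(obj.applyLogic)  # equivalent to obj.applyLogic(df)
transformedDF.show()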

Related

pyspark called from an imported method that calls another method gives empty dataframe

I have a module called test_pyspark_module with the following code:
class SparkTest:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        spark = self.spark
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
            <TABLENAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
            and app_id=1233
        """
        return self.fetch_data(sql_text)

def fetch_data(sql, sql_text):
    data = sql(sql_text).toPandas()
    print(len(data))
I have a pyspark kernel running, and I have the following code in my jupyter notebook:
from pyspark.sql import SQLContext
from pyspark import SparkContext

sqlContext = SQLContext(spark)
sql = sqlContext.sql

sql_text = """
SELECT *
FROM
    <TABLENAME>
WHERE date BETWEEN '${date-15}' and '${date-1}'
    and app_id=1233
"""

from test_pyspark_module import *

st = SparkTest(spark, sql)
Now when I run st.fetch_data(sql_text) I get 43000. However, when I run st.call_fetch_data() I get 0.
I wanted to see if something was going wrong with the import, so I implemented a duplicate of SparkTest locally, calling it SparkTest2. This works as I expect, with both functions returning 43000.
class SparkTest2:
    def __init__(self, spark, sql):
        self.spark = spark
        self.sql = sql

    def fetch_data(self, sql_text):
        data = self.sql(sql_text).toPandas()
        print(len(data))

    def call_fetch_data(self):
        sql_text = """
        SELECT *
        FROM
            <TABLE_NAME>
        WHERE date BETWEEN '${date-15}' and '${date-1}'
            and app_id=1233
        """
        return self.fetch_data(sql_text)

st2 = SparkTest2(spark, sql)
st2.fetch_data(sql_text) gives output 43000, and st2.call_fetch_data() also gives output 43000.
So it seems that if I run a class method that calls another method, and the class is imported, it fails to give correct results. Note that there is no error or exception; I just get 0 rows (I do get the correct number of columns, i.e. 28).
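One way to narrow this down (a debugging sketch, not a diagnosis; logging_sql is a hypothetical wrapper): log the exact SQL string each call path sends, since the notebook-defined sql_text and the module-defined one may differ, for example in how the '${date-15}' placeholders end up substituted:

def logging_sql(sql):
    # Hypothetical wrapper: print exactly what each call path executes.
    def wrapper(sql_text):
        print(repr(sql_text))  # compare the two strings character for character
        return sql(sql_text)
    return wrapper

st = SparkTest(spark, logging_sql(sql))
st.fetch_data(sql_text)   # path that returns 43000
st.call_fetch_data()      # path that returns 0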

How to call another object within Pyspark UDF function

I have a class Hello with a few methods, and I would like to create a Hello object within a PySpark UDF, such as:
def foo_generation(query_params):
    query_obj = Hello()
    foo = query_obj.hello_method(query_params)
    return foo

spark.udf.register("foo_generation", foo_generation)
df = df.withColumn("foo", F.expr("foo_generation(query_param_ES)"))
This doesn't appear to be working.
How should I generate a Hello object in this instance?
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

@udf(returnType=StringType())  # the decorator replaces the spark.udf.register(...) call
def foo_generation(query_params):
    query_obj = Hello()
    foo = query_obj.hello_method(query_params)
    return foo

df = df.withColumn("foo", foo_generation(df["query_param_ES"]))
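One caveat worth keeping in mind: the UDF body runs on the executors, so everything it references, including the Hello class, must be shippable to them (importable or serializable from the driver). A minimal, self-contained sketch with a stand-in Hello class (hypothetical, since the real one is not shown):

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

spark = SparkSession.builder.master("local[*]").appName("UdfObjectDemo").getOrCreate()

class Hello:  # stand-in for the real class, which is not shown
    def hello_method(self, query_params):
        return "hello:" + query_params

@udf(returnType=StringType())
def foo_generation(query_params):
    query_obj = Hello()  # constructed on the executor, per invocation
    return query_obj.hello_method(query_params)

df = spark.createDataFrame([("a",), ("b",)], ["query_param_ES"])
df = df.withColumn("foo", foo_generation(df["query_param_ES"]))
df.show()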

Writing a Spark ETL job in a high-performance and object-oriented way

A conceptual question about achieving high performance in a Spark job.
In an ETL job, I am creating a data frame in the extract phase from a data source (S3, a database, or a raw file), i.e. df = extract(configs).
Then, in the transform phase, i.e. df_transform = transform(df, configs), I am using multiple Python methods to transform my dataframe, i.e. cleaning(df), setup_different_conditional_edits(df), windowing_function_transformations(df), other_statestical_tranformations(df).
All the transform functions return the df after applying their transformations (pandas style), i.e.:
def transform(df, config):
    df = cleaning(df)
    df = setup_different_conditional_edits(df)
    df = windowing_function_transformations(df)
    df = other_statestical_tranformations(df)
    return df
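As an aside, the same pipeline can be written with DataFrame.transform chaining (available since Spark 3.0), keeping each step a plain DataFrame-to-DataFrame function. A sketch, assuming the same step functions exist:

# Same pipeline using DataFrame.transform chaining (Spark 3.0+); each step
# is assumed to be a function taking and returning a DataFrame.
def transform(df, config):
    return (
        df.transform(cleaning)
        .transform(setup_different_conditional_edits)
        .transform(windowing_function_transformations)
        .transform(other_statestical_tranformations)
    )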
Now, the data frame I am creating in extract(configs) is quite huge (~10-100 GB). Assuming that I have enough memory and compute resources, is it good practice to pass a huge data frame between two Python functions, or is using a UDF a better bet?
Also, if I am using a Python class, is it good practice to declare the Spark dataframe as a class instance variable? i.e. (conceptual):
class Extract(object):
    def __init__(self, configs):
        """ declaring configs as instance variable """
        self.configs = configs

    def run(self):
        """ All Extraction Methods """
        return df


class Transform(object):
    def __init__(self, df):
        """ passing df as instance variable """
        self.df = df

    def run(self):
        """ All Transformation Methods """
        self.df = cleaning(self.df)
        self.df = setup_different_conditional_edits(self.df)
        self.df = windowing_function_transformations(self.df)
        self.df = other_statestical_tranformations(self.df)
        return self.df


class Load(object):
    def __init__(self, df):
        """ passing df as instance variable """
        self.df = df

    def run(self):
        """ Logic for loading transformed df """
        return True
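Worth noting when weighing these options: a Spark DataFrame is a lazy query plan plus a reference to the data, not the materialized 10-100 GB, so passing it between functions or storing it as an instance variable copies nothing; the real work happens only when an action runs. A hypothetical sketch of wiring the classes above together:

# Hypothetical wiring of the conceptual classes above (step functions assumed to exist).
configs = {}  # placeholder for whatever extract() needs

df = Extract(configs).run()        # builds the lazy DataFrame / query plan
transformed = Transform(df).run()  # appends transformations to the plan
Load(transformed).run()            # the load step triggers the actual execution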

How to call one class output from another class function in Python?

I am having difficulty using the output of one class's function in another class. Below is a sample block of the code:
import pandas as pd

# First Class
class ppl():
    def create_ppl(self):
        ppll_info = pd.DataFrame({
            'Name': ['Bob', 'Amy'],
            'Age': [12, 19],
            'Gender': ['male', 'female']
        })
        instructor_info = pd.DataFrame({
            'Name_Inst': ['Tobby', 'John'],
            'Age': [41, 37],
            'Gender': ['male', 'male']
        })

pl = ppl()
pl.create_ppl()

# Second Class
class robot():
    def create_rbt(self):
        rbt_info = pd.DataFrame({
            'Name': ['Robot_1', 'Robt_2'],
            'Manufacture': ['ABC', 'XYZ'],
            'Owner': ['Bob', 'Amy']
        })
        full_table = pd.merge(
            rbt_info,
            pl.create_ppl(),
            left_on='Owner',
            right_on='Name', how='left'
        )
        return full_table

rbt = robot()
rbt.create_rbt()
I received the error TypeError: Can only merge Series or DataFrame objects, a <class 'NoneType'> was passed
It works if I add return ppll_info at the end of the first class, but my actual output dataframe is really large and I don't want to return the output. Is there a more elegant way to do this?
Thanks in advance for the help.
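For what it's worth, returning the dataframe does not copy it: Python returns a reference to the existing object, so return ppll_info is cheap no matter how large the frame is. A minimal sketch of that fix, plus keeping the frame on the instance so it is built once and reused (the self.ppll_info attribute is an illustrative addition):

import pandas as pd

class ppl():
    def create_ppl(self):
        ppll_info = pd.DataFrame({
            'Name': ['Bob', 'Amy'],
            'Age': [12, 19],
            'Gender': ['male', 'female']
        })
        self.ppll_info = ppll_info  # keep it on the instance for reuse
        return ppll_info            # returning passes a reference, not a copy

pl = ppl()
pl.create_ppl()
# pd.merge(rbt_info, pl.ppll_info, ...) now works without rebuilding the frame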

Variable assignment before function

I have created a package to quickly transform data using pandas and XlsxWriter.
This worked pretty well and I wrote a few functions successfully, but recently I've hit a wall:
For a few functions I need to define variables first, but they are not basic types (list, tuple, str, etc.); for instance, a dataframe. I've looked into global variables and saw they are not recommended (and I wouldn't know where to put them), and I also looked into classes, but I don't know how to solve my problem with them. I've also tried creating an empty dataframe, but it was still empty after the function.
What I'm trying to do is a read function with pandas for .csv or .xlsx files and a save function using the XlsxWriter engine.
The goal is to change as little as possible in the code to transform data frequently and rapidly (e.g. I have functions doing LEFT and RIGHT like in Excel, or even MIDDLE with column numbers) and to keep the code in main.py easy and short.
Here is the stripped-down version of my code, which uses two Python files (main.py and format_operations.py). I have added comments where I'm having issues.
Thanks in advance for your help!
"""
main.py
"""
import format_operations as tbfrm #import another python file in the same folder
import pandas as pd
import numpy as np
import xlsxwriter.utility
#file settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
dfname = ??? #I need to create the variable but I don't know how
tbfrm.FCT_universal_read(dfname,file_full_path) #CAN'T GET IT TO WORK
#column operations and formatting
columns_numeric = [3,6] # (with pandas) list of columns with number values by iloc number, starts at 0 which is column A in Excel
tbfrm.FCT_columns_numeric(dfname,columns_numeric) #example of a WORKING function (if dfname is defined)
#write with Xlsxwriter engine
XLWRITER_DF = ??? #same problem as before, how to create the variable?
workbookvarname = ??? #same here
worksheetvarname = ??? # same here
tbfrm.FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname) #CAN'T GET IT TO WORK
#### WORKING piece of code I want to execute after saving with Xlsxwriter engine ####
worksheet.set_zoom(80)
# Conditional formatting
color_range_1 = "J1:J{}".format(number_rows+1)
FORMAT1 = workbook.add_format({'bg_color': '#FFC7CE','font_color': '#9C0006'})
FORMAT2 = workbook.add_format({'bg_color': '#C6EFCE','font_color': '#006100'})
worksheet.conditional_format(color_range_1, {'type': 'bottom','value': '5','format': FORMAT1})
worksheet.conditional_format(color_range_1, {'type': 'top','value': '5','format': FORMAT2})
Other file:
"""
format_operations.py
"""
import pandas as pd
import numpy as np
import xlsxwriter.utility
def FCT_universal_read(dfname,file_full_path):
if ".xls" in file_full_path:
dfname = pd.read_excel(file_full_path) #optional arguments:sheetname='Sheet1', header=0 , dtype=object to preserve values
if ".csv" in file_full_path:
dfname = pd.read_csv(file_full_path)
# save file with XLSXWriter engine for additional options to pandas
def FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname):
XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to,encoding='utf-8')
workbookvarname = XLWRITER_DF.book
worksheetvarname = XLWRITER_DF.sheets[sheet_name_save_to]
#format as numbers
def FCT_columns_numeric(dfname,columns_numeric):
for x in columns_numeric:
dfname.iloc[:,x] = pd.to_numeric(dfname.iloc[:,x])
Your FCT_universal_read function should not modify a dataframe but instead return a new one:
def FCT_universal_read(file_full_path):
    if file_full_path.split('.')[-1] == "xls":
        df = pd.read_excel(file_full_path)  # optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
    if file_full_path.split('.')[-1] == "csv":
        df = pd.read_csv(file_full_path)
    return df
And in your main, do:
dfname = tbfrm.FCT_universal_read(file_full_path)
Same answer for FCT_df_xlsxwriter: you should rewrite it with a return so that you can do:
XLWRITER_DF, workbookvarname, worksheetvarname = tbfrm.FCT_df_xlsxwriter(dfname, file_save_to, sheet_name_save_to)
To grasp how Python deals with the arguments you pass to a function, you should read these blog posts:
https://jeffknupp.com/blog/2012/11/13/is-python-callbyvalue-or-callbyreference-neither/
https://robertheaton.com/2014/02/09/pythons-pass-by-object-reference-as-explained-by-philip-k-dick/
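The short version of those posts, as a runnable sketch: rebinding a parameter name inside a function never affects the caller's variable, while mutating the object it refers to does. This is exactly why the assignment inside FCT_universal_read never reaches main.py:

import pandas as pd

def rebind(df):
    # Rebinding the local name: the caller's variable is untouched.
    df = pd.DataFrame({'a': [1, 2]})

def mutate(df):
    # Mutating the object the name refers to: the caller sees this.
    df['b'] = [10, 20]

frame = pd.DataFrame({'a': [0, 0]})
rebind(frame)
print(list(frame.columns))  # ['a']       -- unchanged
mutate(frame)
print(list(frame.columns))  # ['a', 'b']  -- changed in place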
You need to update FCT_universal_read so that it returns the dataframe you want. There is no need to define the dataframe outside the function; simply create and return it:
df = FCT_universal_read('/your/file/path')

def FCT_universal_read(file_full_path):
    if ".xls" in file_full_path:
        df = pd.read_excel(file_full_path)  # optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
        return df
    if ".csv" in file_full_path:
        df = pd.read_csv(file_full_path)
        return df
Thanks so much to both of you! I get the logic now :) Thanks also for the documentation.
I successfully managed to do both functions. I had been struggling for several hours.
I like the .split call you used, which ensures the script only looks at the extension.
I updated FCT_df_xlsxwriter and FCT_universal_read as you suggested. Here are both functions, corrected:
'''
format_operations.py
'''
import pandas as pd

def FCT_universal_read(file_full_path):
    if "xls" in file_full_path.split('.')[-1]:
        dfname = pd.read_excel(file_full_path)  # example: C:/Tests/Bigdata.xlsx
        return dfname
    if "csv" in file_full_path.split('.')[-1]:
        dfname = pd.read_csv(file_full_path)
        return dfname

def FCT_df_xlsxwriter(dfname, file_save_to, sheet_name_save_to):
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to, encoding='utf-8')
    workbook = XLWRITER_DF.book
    worksheet = XLWRITER_DF.sheets[sheet_name_save_to]
    return XLWRITER_DF, workbook, worksheet
Here is how I call the two functions:
'''
main.py
'''
import format_operations as tbfrm
import pandas as pd
import xlsxwriter.utility

# settings
file_full_path = "C:/Tests/big_data.xlsx"
file_save_to = "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to = "Xlswriter"

# functions
FILE_DF = tbfrm.FCT_universal_read(file_full_path)
XLWRITER_DF, workbook, worksheet = tbfrm.FCT_df_xlsxwriter(FILE_DF, file_save_to, sheet_name_save_to)
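One detail to keep in mind with this version (not shown in the thread): with the XlsxWriter engine, nothing is written to disk until the ExcelWriter is saved or closed, so after any workbook/worksheet formatting the script should end with something like:

XLWRITER_DF.save()  # on older pandas; newer pandas versions use XLWRITER_DF.close()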
