How to call avro SchemaConverters in Pyspark - apache-spark

Although PySpark has Avro support, it does not expose SchemaConverters. I may be able to use Py4J to accomplish this, but I have never used a Java package from within Python.
This is the code I am using:
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def _test():
    # Create SparkSession
    spark = SparkSession.builder \
        .master("local[1]") \
        .appName("sparvro") \
        .getOrCreate()
    avroSchema = sc._jvm.org.apache.spark.sql.avro.SchemaConverters.toAvroType(StructType([StructField("firstname", StringType(), True)]))

if __name__ == "__main__":
    _test()
However, I keep getting this error:
AttributeError: 'StructField' object has no attribute '_get_object_id'
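A possible workaround (a sketch only, untested, assuming Spark 2.4+ with the spark-avro package on the classpath) is to rebuild the schema as a JVM object before handing it to Py4J, since Py4J cannot translate the Python StructType/StructField objects, and to spell out every argument of toAvroType because Py4J does not see Scala default parameters:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.master("local[1]").appName("sparvro").getOrCreate()
sc = spark.sparkContext

py_schema = StructType([StructField("firstname", StringType(), True)])

# Rebuild the schema on the JVM side from its JSON representation
# (assumes DataType.fromJson is reachable through the static forwarder)
java_schema = sc._jvm.org.apache.spark.sql.types.DataType.fromJson(py_schema.json())

# toAvroType(catalystType, nullable, recordName, nameSpace) -- all arguments passed explicitly
avro_type = sc._jvm.org.apache.spark.sql.avro.SchemaConverters.toAvroType(
    java_schema, False, "topLevelRecord", "")
print(avro_type.toString())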

Related

Pyspark Streaming with checkpointing is failing

I am working on Spark Streaming data received through a custom receiver, using PySpark. To make it fault tolerant I enabled checkpointing. Since then, the code that ran fine before checkpointing was introduced is throwing an error.
Error message:
pubsubStream.flatMap(lambda x : x).map(lambda x: convertjson(x)).foreachRDD(lambda rdd : dstream_to_rdd(rdd))
File "/home/test/spark_checkpointing/spark_checkpoint_test.py", line 227, in dstream_to_rdd
df = spark_session.read.option("multiline","true")\
NameError: name 'sparkContext' is not defined
The code is as below:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pubsub import PubsubUtils
import json
import time
from pyspark.sql.types import (StructField, StringType, StructType, IntegerType, FloatType, LongType, BooleanType)
from google.cloud import storage
import pyspark

conf_bucket_name = <bucket_name>

# Events list
events_list = ["Event1", "Event2"]

# This chunk of schema creation will be automated later
# and most probably moved outside
full_schema = StructType([
    StructField('_id', StructType([
        StructField('_data', StringType(), True)
    ])),
    StructField('ct', StructType([
        StructField('$timestamp', StructType([
            StructField('i', LongType(), True),
            StructField('t', LongType(), True),
        ]))])),
    StructField('fg', StructType([
        StructField('sgs', StructType([
            StructField('col1', StringType(), True),
            StructField('col2', StringType(), True)
        ]))])),
    StructField('col6', StringType(), True),
    StructField('_corrupt_record', StringType(), True)
])
def convertjson(ele):
    temp = json.loads(ele.decode('utf-8'))
    if temp['col6'] == 'update':
        del temp['updateDescription']
        return temp
    return temp

def dstream_to_rdd(x):
    if not x.isEmpty():
        df = spark_session.read.option("multiline", "true")\
            .option("mode", "PERMISSIVE")\
            .option("primitivesAsString", "false")\
            .schema(full_schema)\
            .option("columnNameOfCorruptRecord", "_corrupt_record")\
            .option("allowFieldAddition", "true")\
            .json(x)
        df.show(truncate=True)
        #df.printSchema()

def createContext(all_config):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print("Creating new context")
    ssc = StreamingContext(spark_session.sparkContext, 10)
    pubsubStream = PubsubUtils.createStream(ssc, <SUBSCRIPTION>, 10000, True)
    # Print the records of the dstream
    pubsubStream.pprint()  # DStreams are getting printed on the console
    # The DStream is transformed using flatMap to flatten it, as a tuple may have multiple records,
    # then converted to JSON format and finally pushed to BQ
    pubsubStream.flatMap(lambda x: x).map(lambda x: convertjson(x)).foreachRDD(lambda rdd: dstream_to_rdd(rdd))
    pubsubStream.checkpoint(50)
    return ssc
if __name__ == "__main__":
    # Declaration of the spark session and streaming context
    checkpointDir = <checkpointdir path on google cloud storage>
    spark_session = SparkSession.builder.appName("Test_spark_checkpoint").getOrCreate()
    spark_session.conf.set('temporaryGcsBucket', <temp bucket name>)
    # all_config is defined elsewhere (not shown in the question)
    ssc = StreamingContext.getOrCreate(checkpointDir, lambda: createContext(all_config))
    ssc.start()
    ssc.awaitTermination()
The error message says sparkContext is not defined. On doing dir(spark_session) I found
that it returns a list of attributes and methods that contains sparkContext. Am I supposed to pass it explicitly? What am I missing here?
Also, please help me understand whether the checkpointing is placed correctly in the code.
Updated piece of code: I tried with a SparkContext instead of a SparkSession:
conf = SparkConf()
conf.setAppName("Test_spark_checkpoint")
conf.set('temporaryGcsBucket', <temp bucket>)
sc = SparkContext(conf=conf)
print(dir(sc))
ssc = StreamingContext.getOrCreate(checkpointDir, lambda: createContext(all_config))
df = sc.read.option("multiline", "true")\
    .option("mode", "PERMISSIVE")\
    .option("primitivesAsString", "false")\
    .schema(full_schema)\
    .option("columnNameOfCorruptRecord", "_corrupt_record")\
    .option("allowFieldAddition", "true")\
    .json(x)
df.show(truncate=True)
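One likely cause: when the StreamingContext is restored from the checkpoint, the recovered lambdas no longer see the module-level spark_session created under if __name__ == "__main__", so names referenced inside them are undefined. The usual pattern (from the Spark Streaming programming guide) is to obtain the SparkSession lazily inside the function that runs on each RDD. A sketch of how dstream_to_rdd could be restructured, keeping the names from the question (not a tested drop-in replacement):
from pyspark.sql import SparkSession

def get_spark_session(spark_conf):
    # returns the existing SparkSession, or creates one from the recovered SparkConf
    return SparkSession.builder.config(conf=spark_conf).getOrCreate()

def dstream_to_rdd(rdd):
    if not rdd.isEmpty():
        # look up the session from the RDD's own SparkContext instead of a global
        spark = get_spark_session(rdd.context.getConf())
        df = spark.read.option("multiline", "true")\
            .option("mode", "PERMISSIVE")\
            .schema(full_schema)\
            .option("columnNameOfCorruptRecord", "_corrupt_record")\
            .json(rdd)
        df.show(truncate=True)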

Creating pyspark's spark context py4j java gateway object

I am trying to convert a Java DataFrame to a PySpark DataFrame. For this I am creating a DataFrame (or Dataset of Row) in a Java process and starting a py4j.GatewayServer in it. Then, on the Python side, I am creating a py4j.java_gateway.JavaGateway() client object and passing it to PySpark's SparkContext constructor to link it to the JVM process already started. But I am getting this error:
File: "path_to_virtual_environment/lib/site-packages/pyspark/conf.py", line 120, in __init__
self._jconf = _jvm.SparkConf(loadDefaults)
TypeError: 'JavaPackage' object is not callable
Can someone please help?
Below is the code I am using.
Java code:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import py4j.GatewayServer;

public class TestJavaToPythonTransfer {
    Dataset<Row> df1;

    public TestJavaToPythonTransfer() {
        SparkSession spark =
            SparkSession.builder().appName("test1").config("spark.master", "local").getOrCreate();
        df1 = spark.read().json("path/to/local/json_file");
    }

    public Dataset<Row> getDf() {
        return df1;
    }

    public static void main(String args[]) {
        GatewayServer gatewayServer = new GatewayServer(new TestJavaToPythonTransfer());
        gatewayServer.start();
        System.out.println("Gateway server started");
    }
}
Python code:
from pyspark.sql import SQLContext, DataFrame
from pyspark import SparkContext, SparkConf
from py4j.java_gateway import JavaGateway

gateway = JavaGateway()
conf = SparkConf().set('spark.io.encryption.enabled', 'true')
py_sc = SparkContext(gateway=gateway, conf=conf)
j_df = gateway.getDf()
py_df = DataFrame(j_df, SQLContext(py_sc))
print('print dataframe content')
print(py_df.collect())
Command to run the Python code:
python path_to_python_file.py
I also tried doing this:
$SPARK_HOME/bin/spark-submit --master local path_to_python_file.py
Here the code does not throw any error, but it also does not print anything to the terminal. Do I need to set some Spark conf for this?
P.S. - apologies in advance if there is a typo in the code, since I could not copy the code and error stack directly from my firm's IDE.
There is a missing call to entry_point before calling getDf(). So, try this:
app = gateway.entry_point
j_df = app.getDf()
Additionally, I have created a working copy using Python and Scala (hope you don't mind) below that shows how, on the Scala side, a py4j gateway is started with a Spark session and a sample DataFrame, and how, on the Python side, I accessed that DataFrame and converted it to a Python list of tuples before converting it back to a DataFrame for a Spark session on the Python side:
Python:
from py4j.java_gateway import JavaGateway
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StructField
if __name__ == '__main__':
    gateway = JavaGateway()
    spark_app = gateway.entry_point
    df = spark_app.df()
    # Note: the "apply" method here comes from Scala's companion object, to access elements of an array
    df_to_list_tuple = [(int(i.apply(0)), int(i.apply(1))) for i in df]
    spark = (SparkSession
             .builder
             .appName("My PySpark App")
             .getOrCreate())
    schema = StructType([
        StructField("a", IntegerType(), True),
        StructField("b", IntegerType(), True)])
    df = spark.createDataFrame(df_to_list_tuple, schema)
    df.show()
Scala:
import java.nio.file.{Path, Paths}
import org.apache.spark.sql.SparkSession
import py4j.GatewayServer

object SparkApp {
  val myFile: Path = Paths.get(System.getProperty("user.home") + "/dev/sample_data/games.csv")

  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("My app")
    .getOrCreate()

  val df = spark
    .read
    .option("header", "True")
    .csv(myFile.toString)
    .collect()
}

object Py4JServerApp extends App {
  val server = new GatewayServer(SparkApp)
  server.start()
  print("Started and running...")
}
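A note on running this setup (an assumption about the intended workflow, not part of the original answer): the Scala Py4JServerApp must be started first so that the GatewayServer is listening on its default port (25333); the Python script is then run with plain python, assuming pyspark is installed in that Python environment, since it builds its own SparkSession. gateway.entry_point resolves to whatever object was passed to the GatewayServer constructor, which is why the original Python code had to call getDf() through the entry point rather than on the gateway itself.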

Module not found error when importing Pyspark Delta Lake module

I'm running PySpark with Delta Lake, but when I try to import the delta module I get a ModuleNotFoundError: No module named 'delta'. This is on a machine without an internet connection, so I had to download the delta-core jar manually from Maven and place it into the %SPARK_HOME%/jars folder.
My program works without any issues and I'm able to write to and read from Delta Lake, so I'm confident I've got the correct jar. But when I try to import the delta module with from delta.tables import * I get the error.
For information my code is:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, FloatType, StructType, StructField
from pyspark.sql.functions import input_file_name
from Constants import Constants

if __name__ == "__main__":
    constants = Constants()
    spark = SparkSession.builder.master("local[*]")\
        .appName("Delta Lake Testing")\
        .getOrCreate()
    # have to start spark session before importing: https://docs.delta.io/latest/quick-start.html#python
    from delta.tables import *
    # set logging level to limit output
    spark.sparkContext.setLogLevel("ERROR")
    spark.conf.set("spark.sql.session.timeZone", "UTC")
    # push additional python files to the worker nodes
    base_path = os.path.abspath(os.path.dirname(__file__))
    spark.sparkContext.addPyFile(os.path.join(base_path, 'Constants.py'))
    # start pipeline
    schema = StructType([StructField("Timestamp", TimestampType(), False),\
                         StructField("ParamOne", FloatType(), False),\
                         StructField("ParamTwo", FloatType(), False),\
                         StructField("ParamThree", FloatType(), False)])
    df = spark.readStream\
        .option("header", "true")\
        .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
        .schema(schema)\
        .csv(constants.input_path)\
        .withColumn("input_file_name", input_file_name())
    df.writeStream\
        .format("delta")\
        .outputMode("append")\
        .option("checkpointLocation", constants.checkpoint_location)\
        .start("/tmp/bronze")
    # await on stream
    sqm = spark.streams
    sqm.awaitAnyTermination()
This is using Spark v2.4.4 and Python v3.6.1 and the job is submitted using spark-submit path/to/job.py
%pyspark
# the delta-core jar also bundles the 'delta' Python package, so shipping the jar
# with addPyFile makes `from delta.tables import *` resolvable
sc.addPyFile("**LOCATION_OF_DELTA_LAKE_JAR_FILE**")
from delta.tables import *
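Outside a notebook, the same call can go into the spark-submit job itself, right after the SparkSession is created and before the delta import. A sketch, with a hypothetical jar file name standing in for the one downloaded from Maven:
import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]")\
    .appName("Delta Lake Testing")\
    .getOrCreate()

# hypothetical jar name/version -- point this at the delta-core jar copied into the jars folder;
# the jar also contains the 'delta' Python package, which is why addPyFile makes it importable
delta_jar = os.path.join(os.environ["SPARK_HOME"], "jars", "delta-core_2.11-0.4.0.jar")
spark.sparkContext.addPyFile(delta_jar)

from delta.tables import *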

Convert Spark Structured DataFrame to Pandas using pandas_udf

I need to read CSV files as a stream and then convert them to a pandas dataframe.
Here is what I have done so far:
DataShema = StructType([StructField("TimeStamp", LongType(), True), \
                        StructField("Count", IntegerType(), True), \
                        StructField("Reading", FloatType(), True)])

group_columns = ['TimeStamp', 'Count', 'Reading']

@pandas_udf(DataShema, PandasUDFType.GROUPED_MAP)
def get_pdf(pdf):
    return pd.DataFrame([pdf[group_columns]], columns=[group_columns])

# getting Surge data from the files
SrgDF = spark \
    .readStream \
    .schema(DataShema) \
    .csv("ProcessdedData/SurgeAcc")

mydf = SrgDF.groupby(group_columns).apply(get_pdf)

qrySrg = SrgDF \
    .writeStream \
    .format("console") \
    .start() \
    .awaitTermination()
I believe from another source (Convert Spark Structure Streaming DataFrames to Pandas DataFrame) that converting a structured streaming dataframe to pandas is not directly possible, and it seems that pandas_udf is the right approach, but I cannot figure out exactly how to achieve this. I need the pandas dataframe to pass into my functions.
Edit
when I run the code (changing the query to mydf rather than SrgDF) then I get the following error: pyspark.sql.utils.StreamingQueryException: 'Writing job aborted.\n=== Streaming Query ===\nIdentifier: [id = 18a15e9e-9762-4464-b6d1-cb2db8d0ac41, runId = e3da131e-00d1-4fed-82fc-65bf377c3f99]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {FileStreamSource[file:/home/mls5/Work_Research/Codes/Misc/Python/MachineLearning_ArtificialIntelligence/00_Examples/01_ApacheSpark/01_ComfortApp/ProcessdedData/SurgeAcc]: {"logOffset":0}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nFlatMapGroupsInPandas [Count#1], get_pdf(TimeStamp#0L, Count#1, Reading#2), [TimeStamp#10L, Count#11, Reading#12]\n+- Project [Count#1, TimeStamp#0L, Count#1, Reading#2]\n +- StreamingExecutionRelation FileStreamSource[file:/home/mls5/Work_Research/Codes/Misc/Python/MachineLearning_ArtificialIntelligence/00_Examples/01_ApacheSpark/01_ComfortApp/ProcessdedData/SurgeAcc], [TimeStamp#0L, Count#1, Reading#2]\n'
19/05/20 18:32:29 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
/usr/local/lib/python3.6/dist-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream
warnings.warn("pyarrow.open_stream is deprecated, please use ".
EDIT-2
Here is the code to reproduce the error
import sys
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession, SQLContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pyarrow as pa
import glob

#####################################################################################
if __name__ == '__main__':
    spark = SparkSession \
        .builder \
        .appName("RealTimeIMUAnalysis") \
        .getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    # reduce verbosity
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    ##############################################################################
    # using the saved files to do the Analysis
    DataShema = StructType([StructField("TimeStamp", LongType(), True), \
                            StructField("Count", IntegerType(), True), \
                            StructField("Reading", FloatType(), True)])
    group_columns = ['TimeStamp', 'Count', 'Reading']

    @pandas_udf(DataShema, PandasUDFType.GROUPED_MAP)
    def get_pdf(pdf):
        return pd.DataFrame([pdf[group_columns]], columns=[group_columns])

    # getting Surge data from the files
    SrgDF = spark \
        .readStream \
        .schema(DataShema) \
        .csv("SurgeAcc")

    mydf = SrgDF.groupby('Count').apply(get_pdf)
    #print(mydf)
    qrySrg = mydf \
        .writeStream \
        .format("console") \
        .start() \
        .awaitTermination()
To run it, you need to create a folder named SurgeAcc next to the code and create a CSV file inside it with the following format:
TimeStamp,Count,Reading
1557011317299,45148,-0.015494
1557011317299,45153,-0.015963
1557011319511,45201,-0.015494
1557011319511,45221,-0.015494
1557011315134,45092,-0.015494
1557011315135,45107,-0.014085
1557011317299,45158,-0.015963
1557011317299,45163,-0.015494
1557011317299,45168,-0.015024
The dataframe your pandas_udf returns does not match the schema specified.
Please note that the input to the pandas_udf is a pandas dataframe, and it also returns a pandas dataframe.
You can use all pandas functions inside the pandas_udf. The only thing you have to make sure of is that ReturnDataShema matches the actual output of the function.
ReturnDataShema = StructType([StructField("TimeStamp", LongType(), True), \
                              StructField("Count", IntegerType(), True), \
                              StructField("Reading", FloatType(), True), \
                              StructField("TotalCount", FloatType(), True)])

@pandas_udf(ReturnDataShema, PandasUDFType.GROUPED_MAP)
def get_pdf(pdf):
    # This following stmt is causing the schema mismatch
    # return pd.DataFrame([pdf[group_columns]], columns=[group_columns])
    # If you want to return all the rows of the pandas dataframe,
    # you can simply
    # return pdf
    # If you want to do any aggregations, you can do it like the below, or use a pandas query,
    # but make sure the returned pandas dataframe complies with ReturnDataShema
    total_count = pdf['Count'].sum()
    return pd.DataFrame([(pdf.TimeStamp[0], pdf.Count[0], pdf.Reading[0], total_count)])
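For completeness, a sketch of how the corrected UDF could be wired into the streaming query from the question (assuming the same SurgeAcc source and the imports above; not tested as-is). Labelling the returned columns with the names from ReturnDataShema avoids relying on positional matching:
@pandas_udf(ReturnDataShema, PandasUDFType.GROUPED_MAP)
def get_pdf(pdf):
    # aggregate per group, then return one row whose columns match ReturnDataShema by name
    total_count = pdf['Count'].sum()
    return pd.DataFrame(
        [(pdf.TimeStamp.iloc[0], pdf.Count.iloc[0], pdf.Reading.iloc[0], float(total_count))],
        columns=['TimeStamp', 'Count', 'Reading', 'TotalCount'])

mydf = SrgDF.groupby('Count').apply(get_pdf)

qrySrg = mydf \
    .writeStream \
    .format("console") \
    .start() \
    .awaitTermination()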

How to get detailed Information about Spark Stages&Tasks

I've set up an Apache Spark cluster with a master and one worker, and I use Python with Spyder as the IDE. Everything works fine so far, but I need detailed information about the task distribution in the cluster. I know there is the Spark web UI, but I would like to have the information directly in my Spyder console; that is, which part of my code/script is executed by which worker/master. I think it must be possible to get more information with the Python "socket" package and socket.gethostname(). I am really looking forward to some help.
Here is my code:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import matplotlib.pyplot as plt
from datetime import datetime
from pyspark.sql.functions import udf
import pyspark.sql.functions as F

#spark = SparkSession \
#    .builder \
#    .appName('weather_data') \
#    .getOrCreate()

spark = SparkSession \
    .builder \
    .appName("weather_data_u") \
    .master('master_ip#...')\
    .getOrCreate()

# 'data' is loaded earlier in the script (the read is not shown in the question)
data.show()
data.printSchema()
data_selected = data\
    .select(data['Date'],
            data['TemperatureHighC'],
            data['TemperatureAvgC'],
            data['TemperatureLowC'],
            data['DewpointHighC'],
            data['DewpointAvgC'],
            data['DewpointLowC'],
            data['HumidityAvg'],
            data['WindSpeedMaxKMH'],
            data['WindSpeedAvgKMH'],
            data['GustSpeedMaxKMH'],
            data['PrecipitationSumCM'])

data_selected.printSchema()
data_selected.show()

f = udf(lambda row: datetime.strptime(row, '%Y-%m-%d'), TimestampType())

data_selected = data_selected\
    .withColumn('date', f(data['Date'].cast(StringType())))\
    .withColumn('t_max', data['TemperatureHighC'].cast(DoubleType()))\
    .withColumn('t_mean', data['TemperatureAvgC'].cast(DoubleType()))\
    .withColumn('t_min', data['TemperatureLowC'].cast(DoubleType()))\
    .withColumn('dew_max', data['DewpointHighC'].cast(DoubleType()))\
    .withColumn('dew_mean', data['DewpointAvgC'].cast(DoubleType()))\
    .withColumn('dew_min', data['DewpointLowC'].cast(DoubleType()))\
    .cache()
data_selected.show()

t_mean_calculated = data_selected\
    .groupBy(F.date_format(data_selected.date, 'M'))\
    .agg(F.mean(data_selected.t_max))\
    .orderBy('date_format(date, M)')

t_mean_calculated = t_mean_calculated\
    .withColumn('month', t_mean_calculated['date_format(date, M)'].cast(IntegerType()))\
    .withColumnRenamed('avg(t_max)', 't_max_month')\
    .orderBy('month')\
    .drop(t_mean_calculated['date_format(date, M)'])\
    .select('month', 't_max_month')

t_mean_calculated = t_mean_calculated.collect()
As reported by Jacek Laskowski himself, you can use Spark Core local properties to modify the job name shown in the web UI:
callSite.short
callSite.long
For instance, my Spark application syncs multiple MySQL tables to S3, and I set
spark.sparkContext.setLocalProperty("callSite.short", currentTableName)
to reflect the current table name in the web UI.
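On the original question of seeing which host processed which part of the data, the socket.gethostname() idea from the question does work if the call happens inside a function that runs on the executors. A minimal sketch (an illustration under that assumption, not part of the original answer):
import socket

def tag_partition(index, rows):
    # executed on the worker, so gethostname() reports the host that processed this partition
    return [(index, socket.gethostname(), sum(1 for _ in rows))]

rdd = spark.sparkContext.parallelize(range(1000), 8)
print(rdd.mapPartitionsWithIndex(tag_partition).collect())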
