Pyspark Streaming with checkpointing is failing - apache-spark

I am working on Spark Streaming in PySpark, with data received through a custom receiver. To make it fault tolerant I enabled checkpointing. Since then, the code that was running fine before checkpointing was introduced has started throwing an error.
Error message:
pubsubStream.flatMap(lambda x : x).map(lambda x: convertjson(x)).foreachRDD(lambda rdd : dstream_to_rdd(rdd))
File "/home/test/spark_checkpointing/spark_checkpoint_test.py", line 227, in dstream_to_rdd
df = spark_session.read.option("multiline","true")\
NameError: name 'sparkContext' is not defined
The code is as follows:
import sys
from pyspark import SparkContext,SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pubsub import PubsubUtils
import json
import time
from pyspark.sql.types import (StructField, StringType, StructType, IntegerType, FloatType,LongType,BooleanType)
from google.cloud import storage
import pyspark
conf_bucket_name = <bucket_name>
#Events list
events_list = ["Event1","Event2"]
# This chunk of schema creation will be automated later
# and most probably moved outside
full_schema = StructType([
    StructField('_id', StructType([
        StructField('_data', StringType(), True)
    ])),
    StructField('ct', StructType([
        StructField('$timestamp', StructType([
            StructField('i', LongType(), True),
            StructField('t', LongType(), True),
        ]))])),
    StructField('fg', StructType([
        StructField('sgs', StructType([
            StructField('col1', StringType(), True),
            StructField('col2', StringType(), True)
        ]))])),
    StructField('col6', StringType(), True),
    StructField('_corrupt_record', StringType(), True)
])
def convertjson(ele):
    temp = json.loads(ele.decode('utf-8'))
    if temp['col6'] == 'update':
        del temp['updateDescription']
        return temp
    return temp

def dstream_to_rdd(x):
    if not x.isEmpty():
        df = spark_session.read.option("multiline","true")\
            .option("mode", "PERMISSIVE")\
            .option("primitivesAsString", "false")\
            .schema(full_schema)\
            .option("columnNameOfCorruptRecord", "_corrupt_record")\
            .option("allowFieldAddition","true")\
            .json(x)
        df.show(truncate=True)
        #df.printSchema()
def createContext(all_config):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the checkpoint
    print("Creating new context")
    ssc = StreamingContext(spark_session.sparkContext, 10)
    pubsubStream = PubsubUtils.createStream(ssc, <SUBSCRIPTION>, 10000, True)

    # Print the records of the DStream
    pubsubStream.pprint()  # DStreams are getting printed on the console

    # The DStream is transformed using flatMap to flatten it, as a tuple may have multiple records.
    # Each record is then converted to JSON format and finally pushed to BQ.
    pubsubStream.flatMap(lambda x : x).map(lambda x: convertjson(x)).foreachRDD(lambda rdd : dstream_to_rdd(rdd))

    pubsubStream.checkpoint(50)
    return ssc
if __name__ == "__main__":
    # Declaration of spark session and streaming session
    checkpointDir = <checkpointdir path on google cloud storage>
    spark_session = SparkSession.builder.appName("Test_spark_checkpoint").getOrCreate()
    spark_session.conf.set('temporaryGcsBucket', <temp bucket name>)
    ssc = StreamingContext.getOrCreate(checkpointDir, lambda: createContext(all_config))
    ssc.start()
    ssc.awaitTermination()
The error message says sparkContext is not defined. When I run dir(spark_session), the returned list of attributes and methods does contain sparkContext. Am I supposed to pass it explicitly? What am I missing here?
Also, please help me understand whether the position of the checkpointing calls in the code is correct.
Update: I tried with a SparkContext instead of a SparkSession:
conf = SparkConf()
conf.setAppName("Test_spark_checkpoint")
conf.set('temporaryGcsBucket', <temp bucket>)
sc = SparkContext(conf=conf)
print(dir(sc))
ssc = StreamingContext.getOrCreate(checkpointDir, lambda: createContext(all_config))

df = sc.read.option("multiline","true")\
    .option("mode", "PERMISSIVE")\
    .option("primitivesAsString", "false")\
    .schema(full_schema)\
    .option("columnNameOfCorruptRecord", "_corrupt_record")\
    .option("allowFieldAddition","true")\
    .json(x)
df.show(truncate=True)
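For reference, a common pattern from the Spark Streaming programming guide (not from the original post) is to obtain the SparkSession lazily from the RDD's own SparkContext inside foreachRDD, so the function does not rely on driver-side globals such as spark_session when the StreamingContext is restored from a checkpoint. A minimal sketch, reusing full_schema from above:

from pyspark.sql import SparkSession

def get_spark_session(rdd):
    # Lazily create (or reuse) a SparkSession from the RDD's SparkContext,
    # instead of capturing a driver-global spark_session in the closure.
    return SparkSession.builder.config(conf=rdd.context.getConf()).getOrCreate()

def dstream_to_rdd(rdd):
    if not rdd.isEmpty():
        spark = get_spark_session(rdd)
        df = spark.read.option("multiline", "true") \
            .schema(full_schema) \
            .json(rdd)
        df.show(truncate=True)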

Related

How to call avro SchemaConverters in Pyspark

Although PySpark has Avro support, it does not have the SchemaConverters method. I may be able to use Py4J to accomplish this, but I have never used a Java package within Python.
This is the code I am using
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def _test():
    # Create SparkSession
    spark = SparkSession.builder \
        .master("local[1]") \
        .appName("sparvro") \
        .getOrCreate()
    avroSchema = sc._jvm.org.apache.spark.sql.avro.SchemaConverters.toAvroType(
        StructType([StructField("firstname", StringType(), True)]))

if __name__ == "__main__":
    _test()
However, I keep getting this error:
AttributeError: 'StructField' object has no attribute '_get_object_id'
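For context (not from the original post): the AttributeError happens because a PySpark StructType is a plain Python object, so Py4J cannot hand it to the JVM-side SchemaConverters directly; it has to be rebuilt as a Java DataType first. A hedged sketch of one way to do that, assuming spark-avro is on the classpath and that the toAvroType argument list (nullable, recordName, nameSpace) matches your Spark version:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.master("local[1]").appName("sparvro").getOrCreate()
py_schema = StructType([StructField("firstname", StringType(), True)])

# Rebuild the PySpark schema as a Java DataType via its JSON representation
jvm = spark._jvm
java_schema = jvm.org.apache.spark.sql.types.DataType.fromJson(py_schema.json())

# Scala default arguments are not visible through Py4J, so pass them explicitly
avro_type = jvm.org.apache.spark.sql.avro.SchemaConverters.toAvroType(
    java_schema, False, "topLevelRecord", "")
print(avro_type.toString())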

Creating pyspark's spark context py4j java gateway object

I am trying to convert a Java DataFrame to a PySpark DataFrame. For this I am creating a DataFrame (or Dataset of Row) in a Java process and starting a py4j.GatewayServer process on the Java side. Then, on the Python side, I am creating a py4j.java_gateway.JavaGateway() client object and passing it to PySpark's SparkContext constructor to link it to the JVM process that has already been started. But I am getting this error:
File: "path_to_virtual_environment/lib/site-packages/pyspark/conf.py", line 120, in __init__
self._jconf = _jvm.SparkConf(loadDefaults)
TypeError: 'JavaPackage' object is not callable
Can someone please help?
Below is the code I am using.
Java code:
import py4j.GatewayServer;

public class TestJavaToPythonTransfer {
    Dataset<Row> df1;

    public TestJavaToPythonTransfer() {
        SparkSession spark =
            SparkSession.builder().appName("test1").config("spark.master", "local").getOrCreate();
        df1 = spark.read().json("path/to/local/json_file");
    }

    public Dataset<Row> getDf() {
        return df1;
    }

    public static void main(String[] args) {
        GatewayServer gatewayServer = new GatewayServer(new TestJavaToPythonTransfer());
        gatewayServer.start();
        System.out.println("Gateway server started");
    }
}
Python code:
from pyspark.sql import SQLContext, DataFrame
from pyspark import SparkContext, SparkConf
from py4j.java_gateway import JavaGateway
gateway = JavaGateway()
conf = SparkConf().set('spark.io.encryption.enabled','true')
py_sc = SparkContext(gateway=gateway,conf=conf)
j_df = gateway.getDf()
py_df = DataFrame(j_df,SQLContext(py_sc))
print('print dataframe content')
print(py_df.collect())
Command to run the Python code:
python path_to_python_file.py
I also tried doing this:
$SPARK_HOME/bin/spark-submit --master local path_to_python_file.py
But here, although the code does not throw any error, it does not print anything to the terminal. Do I need to set some Spark conf for this?
P.S. Apologies in advance if there are typos in the code or error stack, since I could not copy them directly from my firm's IDE.
There is a missing call to entry_point before calling getDf()
So, try this:
app = gateway.entry_point
j_df = app.getDf()
Additionally, I have created a working copy using Python and Scala (hope you don't mind) that shows how, on the Scala side, a py4j gateway is started with a Spark session and a sample DataFrame, and how, on the Python side, that DataFrame is accessed and converted to a Python List[Tuple] before being converted back to a DataFrame for a Spark session on the Python side:
Python:
from py4j.java_gateway import JavaGateway
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StructField
if __name__ == '__main__':
    gateway = JavaGateway()
    spark_app = gateway.entry_point
    df = spark_app.df()

    # Note: the "apply" method here comes from Scala's companion object to access elements of an array
    df_to_list_tuple = [(int(i.apply(0)), int(i.apply(1))) for i in df]

    spark = (SparkSession
             .builder
             .appName("My PySpark App")
             .getOrCreate())

    schema = StructType([
        StructField("a", IntegerType(), True),
        StructField("b", IntegerType(), True)])

    df = spark.createDataFrame(df_to_list_tuple, schema)
    df.show()
Scala:
import java.nio.file.{Path, Paths}
import org.apache.spark.sql.SparkSession
import py4j.GatewayServer

object SparkApp {
  val myFile: Path = Paths.get(System.getProperty("user.home") + "/dev/sample_data/games.csv")

  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("My app")
    .getOrCreate()

  val df = spark
    .read
    .option("header", "True")
    .csv(myFile.toString)
    .collect()
}

object Py4JServerApp extends App {
  val server = new GatewayServer(SparkApp)
  server.start()
  print("Started and running...")
}

Reading csv files in PySpark

I am trying to read a csv file and convert it into a dataframe.
input.txt
4324,'Andy',43.5,20.3,53.21
2342,'Sam',22.1
3248,'Jane',11.05,12.87
6457,'Bob',32.1,75.23,71.6
Schema: Id, Name,Jan,Feb,March
As you can see, the csv file does not have trailing "," separators when there are no trailing expenses.
Code:
from pyspark.sql.types import *
input1= sc.textFile('/FileStore/tables/input.txt').map(lambda x: x.split(","))
schema = StructType([StructField('Id',StringType(),True), StructField('Name',StringType(),True), StructField('Jan',StringType(),True), StructField('Feb',StringType(),True), StructField('Mar',StringType(),True)])
df3 = sqlContext.createDataFrame(input1, schema)
I get ValueError: Length of object (4) does not match with length of fields (5). How do I resolve this?
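One minimal sketch (not from the original thread) that addresses the ValueError directly, reusing the question's sc and sqlContext: pad the short rows with None so every row has the five fields the schema expects.

from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([StructField(name, StringType(), True)
                     for name in ['Id', 'Name', 'Jan', 'Feb', 'Mar']])

def pad(fields, width=5):
    # Append None for the missing trailing expense columns
    return fields + [None] * (width - len(fields))

input1 = sc.textFile('/FileStore/tables/input.txt') \
           .map(lambda line: pad(line.split(",")))
df3 = sqlContext.createDataFrame(input1, schema)
df3.show()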
I would first import the file using pandas which should handle everything for you. From there you can then convert the pandas DataFrame to spark and do all your usual stuff. I copied your example txt file and quickly wrote up some code to confirm that it would all work:
import pandas as pd

# Reading in txt file as csv
df_pandas = pd.read_csv('<your location>/test.txt', sep=",")

# Converting to spark dataframe and displaying
df_spark = spark.createDataFrame(df_pandas)
display(df_spark)
This produced the expected output.
The faster method would be to import through spark:
# Importing csv file using pyspark
csv_import = sqlContext.read\
    .format('csv')\
    .options(sep=',', header='true', inferSchema='true')\
    .load('<your location>/test.txt')
display(csv_import)
Which gives the same output.
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Test").getOrCreate()

fields = [StructField('Id', StringType(), True), StructField('Name', StringType(), True),
          StructField('Jan', StringType(), True), StructField('Feb', StringType(), True),
          StructField('Mar', StringType(), True)]
schema = StructType(fields)

data = spark.read.format("csv").load("test2.txt")
df3 = spark.createDataFrame(data.rdd, schema)
df3.show()
Output:
+----+------+-----+-----+-----+
| Id| Name| Jan| Feb| Mar|
+----+------+-----+-----+-----+
|4324|'Andy'| 43.5| 20.3|53.21|
|2342| 'Sam'| 22.1| null| null|
|3248|'Jane'|11.05|12.87| null|
|6457| 'Bob'| 32.1|75.23| 71.6|
+----+------+-----+-----+-----+
Here are a couple of options for you to consider. These use the wildcard character, so you can loop through all folders and sub-folders, look for files with names that match a specific pattern, and merge everything into a single dataframe.
val myDFCsv = spark.read.format("csv")
  .option("sep",",")
  .option("inferSchema","true")
  .option("header","true")
  .load("mnt/rawdata/2019/01/01/client/ABC*.gz")

myDFCsv.show()
myDFCsv.head()
myDFCsv.count()

//////////////////////////////////////////
// If you also need to load the filename
import org.apache.spark.sql.functions.input_file_name

val myDFCsv = spark.read.format("csv")
  .option("sep",",")
  .option("inferSchema","true")
  .option("header","true")
  .load("mnt/rawdata/2019/01/01/client/ABC*.gz")
  .withColumn("file_name",input_file_name())

myDFCsv.show(false)
myDFCsv.head()
myDFCsv.count()

Convert Spark Structured DataFrame to Pandas using pandas_udf

I need to read csv files as stream and then convert that to pandas dataframe.
Here is what I have done so far
DataShema = StructType([StructField("TimeStamp", LongType(), True),
                        StructField("Count", IntegerType(), True),
                        StructField("Reading", FloatType(), True)])

group_columns = ['TimeStamp','Count','Reading']

@pandas_udf(DataShema, PandasUDFType.GROUPED_MAP)
def get_pdf(pdf):
    return pd.DataFrame([pdf[group_columns]],columns=[group_columns])

# getting Surge data from the files
SrgDF = spark \
    .readStream \
    .schema(DataShema) \
    .csv("ProcessdedData/SurgeAcc")

mydf = SrgDF.groupby(group_columns).apply(get_pdf)

qrySrg = SrgDF \
    .writeStream \
    .format("console") \
    .start() \
    .awaitTermination()
From another source (Convert Spark Structure Streaming DataFrames to Pandas DataFrame) I believe that converting a structured streaming dataframe to pandas is not directly possible, and it seems that pandas_udf is the right approach, but I cannot figure out exactly how to achieve this. I need the pandas dataframe to pass into my functions.
Edit
when I run the code (changing the query to mydf rather than SrgDF) then I get the following error: pyspark.sql.utils.StreamingQueryException: 'Writing job aborted.\n=== Streaming Query ===\nIdentifier: [id = 18a15e9e-9762-4464-b6d1-cb2db8d0ac41, runId = e3da131e-00d1-4fed-82fc-65bf377c3f99]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {FileStreamSource[file:/home/mls5/Work_Research/Codes/Misc/Python/MachineLearning_ArtificialIntelligence/00_Examples/01_ApacheSpark/01_ComfortApp/ProcessdedData/SurgeAcc]: {"logOffset":0}}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nFlatMapGroupsInPandas [Count#1], get_pdf(TimeStamp#0L, Count#1, Reading#2), [TimeStamp#10L, Count#11, Reading#12]\n+- Project [Count#1, TimeStamp#0L, Count#1, Reading#2]\n +- StreamingExecutionRelation FileStreamSource[file:/home/mls5/Work_Research/Codes/Misc/Python/MachineLearning_ArtificialIntelligence/00_Examples/01_ApacheSpark/01_ComfortApp/ProcessdedData/SurgeAcc], [TimeStamp#0L, Count#1, Reading#2]\n'
19/05/20 18:32:29 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
/usr/local/lib/python3.6/dist-packages/pyarrow/__init__.py:152: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream
warnings.warn("pyarrow.open_stream is deprecated, please use ".
EDIT-2
Here is the code to reproduce the error
import sys
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession, SQLContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.streaming import StreamingContext
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pyarrow as pa
import glob

#####################################################################################
if __name__ == '__main__':

    spark = SparkSession \
        .builder \
        .appName("RealTimeIMUAnalysis") \
        .getOrCreate()

    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # reduce verbosity
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")

    ##############################################################################
    # using the saved files to do the Analysis
    DataShema = StructType([StructField("TimeStamp", LongType(), True),
                            StructField("Count", IntegerType(), True),
                            StructField("Reading", FloatType(), True)])

    group_columns = ['TimeStamp','Count','Reading']

    @pandas_udf(DataShema, PandasUDFType.GROUPED_MAP)
    def get_pdf(pdf):
        return pd.DataFrame([pdf[group_columns]],columns=[group_columns])

    # getting Surge data from the files
    SrgDF = spark \
        .readStream \
        .schema(DataShema) \
        .csv("SurgeAcc")

    mydf = SrgDF.groupby('Count').apply(get_pdf)
    #print(mydf)

    qrySrg = mydf \
        .writeStream \
        .format("console") \
        .start() \
        .awaitTermination()
To run, you need to create a folder named SurgeAcc where the code is and create a csv file inside with the following format:
TimeStamp,Count,Reading
1557011317299,45148,-0.015494
1557011317299,45153,-0.015963
1557011319511,45201,-0.015494
1557011319511,45221,-0.015494
1557011315134,45092,-0.015494
1557011315135,45107,-0.014085
1557011317299,45158,-0.015963
1557011317299,45163,-0.015494
1557011317299,45168,-0.015024
The dataframe returned by your pandas_udf does not match the schema specified.
Please note that the input to the pandas_udf is a pandas dataframe, and it also returns a pandas dataframe.
You can use all pandas functions inside the pandas_udf. The only thing you have to make sure of is that ReturnDataShema matches the actual output of the function.
ReturnDataShema = StructType([StructField("TimeStamp", LongType(), True),
                              StructField("Count", IntegerType(), True),
                              StructField("Reading", FloatType(), True),
                              StructField("TotalCount", FloatType(), True)])

@pandas_udf(ReturnDataShema, PandasUDFType.GROUPED_MAP)
def get_pdf(pdf):
    # The following stmt is causing the schema mismatch
    # return pd.DataFrame([pdf[group_columns]],columns=[group_columns])
    # If you want to return all the rows of the pandas dataframe, you can simply
    # return pdf
    # If you want to do any aggregations, you can do it like below, or use a pandas query,
    # but make sure the returned pandas dataframe complies with ReturnDataShema
    total_count = pdf['Count'].sum()
    return pd.DataFrame([(pdf.TimeStamp[0], pdf.Count[0], pdf.Reading[0], total_count)])

Calculating duration by subtracting two datetime columns in string format

I have a Spark DataFrame that consists of a series of dates:
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *
sqlContext = SQLContext(sc)
import pandas as pd
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876','sip:4534454450'),
                      ('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321','sip:6413445440'),
                      ('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229','sip:4534437492'),
                      ('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881','sip:6474454453'),
                      ('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323','sip:8874458555')])
schema = StructType([StructField('ID', StringType(), True),
                     StructField('EndDateTime', StringType(), True),
                     StructField('StartDateTime', StringType(), True),
                     StructField('ANI', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
What I want to do is find duration by subtracting EndDateTime and StartDateTime. I figured I'd try and do this using a function:
# Function to calculate time delta
def time_delta(y, x):
    end = pd.to_datetime(y)
    start = pd.to_datetime(x)
    delta = (end - start)
    return delta

# create new RDD and add new column 'Duration' by applying time_delta function
df2 = df.withColumn('Duration', time_delta(df.EndDateTime, df.StartDateTime))
However this just gives me:
>>> df2.show()
ID EndDateTime StartDateTime ANI Duration
X01 2014-02-13T12:36:... 2014-02-13T12:31:... sip:4534454450 null
X02 2014-02-13T12:35:... 2014-02-13T12:32:... sip:6413445440 null
X03 2014-02-13T12:36:... 2014-02-13T12:32:... sip:4534437492 null
XO4 2014-02-13T12:37:... 2014-02-13T12:32:... sip:6474454453 null
XO5 2014-02-13T12:36:... 2014-02-13T12:33:... sip:8874458555 null
I'm not sure if my approach is correct or not. If not, I'd gladly accept another suggested way to achieve this.
As of Spark 1.5 you can use unix_timestamp:
from pyspark.sql import functions as F
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
timeDiff = (F.unix_timestamp('EndDateTime', format=timeFmt)
            - F.unix_timestamp('StartDateTime', format=timeFmt))
df = df.withColumn("Duration", timeDiff)
Note the Java style time format.
>>> df.show()
+---+--------------------+--------------------+--------+
| ID| EndDateTime| StartDateTime|Duration|
+---+--------------------+--------------------+--------+
|X01|2014-02-13T12:36:...|2014-02-13T12:31:...| 258|
|X02|2014-02-13T12:35:...|2014-02-13T12:32:...| 204|
|X03|2014-02-13T12:36:...|2014-02-13T12:32:...| 228|
|XO4|2014-02-13T12:37:...|2014-02-13T12:32:...| 269|
|XO5|2014-02-13T12:36:...|2014-02-13T12:33:...| 202|
+---+--------------------+--------------------+--------+
Thanks to David Griffin. Here's how to do this for future reference.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
                      ('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
                      ('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
                      ('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
                      ('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
                     StructField('EndDateTime', StringType(), True),
                     StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)

# define timedelta function (obtain duration in seconds)
def time_delta(y, x):
    from datetime import datetime
    end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
    start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
    delta = (end - start).total_seconds()
    return delta

# register as a UDF
f = udf(time_delta, IntegerType())

# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Applying time_delta() will give you duration in seconds:
>>> df2.show()
ID EndDateTime StartDateTime Duration
X01 2014-02-13T12:36:... 2014-02-13T12:31:... 258
X02 2014-02-13T12:35:... 2014-02-13T12:32:... 204
X03 2014-02-13T12:36:... 2014-02-13T12:32:... 228
XO4 2014-02-13T12:37:... 2014-02-13T12:32:... 268
XO5 2014-02-13T12:36:... 2014-02-13T12:33:... 202
datediff(Column end, Column start)
Returns the number of days from start to end.
https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html
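For example, a minimal PySpark sketch of the same function (column names taken from the question; the DurationInDays column name is just for illustration, and the strings are cast to timestamps first):

from pyspark.sql import functions as F

# datediff expects date/timestamp columns, so cast the ISO-style strings first
df2 = df.withColumn(
    "DurationInDays",
    F.datediff(F.col("EndDateTime").cast("timestamp"),
               F.col("StartDateTime").cast("timestamp")))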
This can be done in spark-sql by converting the string date to timestamp and then getting the difference.
1: Convert to timestamp:
CAST(UNIX_TIMESTAMP(MY_COL_NAME,'dd-MMM-yy') as TIMESTAMP)
2: Get the difference between dates using datediff function.
This will be combined in a nested function like:
spark.sql("select COL_1, COL_2, datediff( CAST( UNIX_TIMESTAMP( COL_1,'dd-MMM-yy') as TIMESTAMP), CAST( UNIX_TIMESTAMP( COL_2,'dd-MMM-yy') as TIMESTAMP) ) as LAG_in_days from MyTable")
Below is the result:
+---------+---------+-----------+
| COL_1| COL_2|LAG_in_days|
+---------+---------+-----------+
|24-JAN-17|16-JAN-17| 8|
|19-JAN-05|18-JAN-05| 1|
|23-MAY-06|23-MAY-06| 0|
|18-AUG-06|17-AUG-06| 1|
+---------+---------+-----------+
Reference: https://docs-snaplogic.atlassian.net/wiki/spaces/SD/pages/2458071/Date+Functions+and+Properties+Spark+SQL
Use DoubleType instead of IntegerType
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField
from pyspark.sql.functions import udf

# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
                      ('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
                      ('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
                      ('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
                      ('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
                     StructField('EndDateTime', StringType(), True),
                     StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)

# define timedelta function (obtain duration in seconds)
def time_delta(y, x):
    from datetime import datetime
    end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
    start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
    delta = (end - start).total_seconds()
    return delta

# register as a UDF
f = udf(time_delta, DoubleType())

# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Here is a working version for Spark 2.x derived from Jason's answer:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StringType, StructType, StructField

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()

rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
                      ('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
                      ('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
                      ('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
                      ('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
                     StructField('EndDateTime', StringType(), True),
                     StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)

# register as a UDF
from datetime import datetime
sqlContext.registerFunction("time_delta", lambda y, x: (datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f') - datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')).total_seconds())

df.createOrReplaceTempView("Test_table")
spark.sql("SELECT ID,EndDateTime,StartDateTime,time_delta(EndDateTime,StartDateTime) as time_delta FROM Test_table").show()
sc.stop()
