Reading csv files in PySpark - apache-spark

I am trying to read csv file and convert into dataframe.
input.txt
4324,'Andy',43.5,20.3,53.21
2342,'Sam',22.1
3248,'Jane',11.05,12.87
6457,'Bob',32.1,75.23,71.6
Schema: Id, Name,Jan,Feb,March
As you see the csv file doesn't have "," if there are no trailing expenses.
Code:
from pyspark.sql.types import *
input1= sc.textFile('/FileStore/tables/input.txt').map(lambda x: x.split(","))
schema = StructType([StructField('Id',StringType(),True), StructField('Name',StringType(),True), StructField('Jan',StringType(),True), StructField('Feb',StringType(),True), StructField('Mar',StringType(),True)])
df3 = sqlContext.createDataFrame(input1, schema)
I get ValueError: Length of object (4) does not match with length of fields (5). How do I resolve this?

I would first import the file using pandas which should handle everything for you. From there you can then convert the pandas DataFrame to spark and do all your usual stuff. I copied your example txt file and quickly wrote up some code to confirm that it would all work:
import pandas as pd
# Reading in txt file as csv
df_pandas = pd.read_csv('<your location>/test.txt',
sep=",")
# Converting to spark dataframe and displaying
df_spark = spark.createDataFrame(df_pandas)
display(df_pandas)
Which produced the following output:
The faster method would be to import through spark:
# Importing csv file using pyspark
csv_import = sqlContext.read\
.format('csv')\
.options(sep = ',', header='true', inferSchema='true')\
.load('<your location>/test.txt')
display(csv_import)
Which gives the same output.

from pyspark.sql.types import *
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Test").getOrCreate()
fields = [StructField('Id', StringType(), True), StructField('Name', StringType(), True),
StructField('Jan', StringType(), True), StructField('Feb', StringType(), True),
StructField('Mar', StringType(), True)]
schema = StructType(fields)
data = spark.read.format("csv").load("test2.txt")
df3 = spark.createDataFrame(data.rdd, schema)
df3.show()
Output:
+----+------+-----+-----+-----+
| Id| Name| Jan| Feb| Mar|
+----+------+-----+-----+-----+
|4324|'Andy'| 43.5| 20.3|53.21|
|2342| 'Sam'| 22.1| null| null|
|3248|'Jane'|11.05|12.87| null|
|6457| 'Bob'| 32.1|75.23| 71.6|
+----+------+-----+-----+-----+

Here are a couple options for you to consider. These use the wildcard character, so you can loop through all folders and sub-folders, look for files with names that match a specific pattern, and merge everything into a dingle dataframe.
val myDFCsv = spark.read.format("csv")
.option("sep",",")
.option("inferSchema","true")
.option("header","true")
.load("mnt/rawdata/2019/01/01/client/ABC*.gz")
myDFCsv.show()
myDFCsv.head()
myDFCsv.count()
//////////////////////////////////////////
// If you also need to load the filename
import org.apache.spark.sql.functions.input_file_name
val myDFCsv = spark.read.format("csv")
.option("sep",",")
.option("inferSchema","true")
.option("header","true")
.load("mnt/rawdata/2019/01/01/client/ABC*.gz")
.withColumn("file_name",input_file_name())
myDFCsv.show(false)
myDFCsv.head()
myDFCsv.count()

Related

Round Spark DataFrame in-place

I read a .csv file to Spark DataFrame. For a DoubleType column is there a way to specify at the time of the file read that this column should be rounded to 2 decimal places? I'm also supplying a custom schema to the DataFrameReader API call. Here's my schema and API calls:
val customSchema = StructType(Array(StructField("id_1", IntegerType, true),
StructField("id_2", IntegerType, true),
StructField("id_3", DoubleType, true)))
#using Spark's CSV reader with custom schema
#spark == SparkSession()
val parsedSchema = spark.read.format("csv").schema(customSchema).option("header", "true").option("nullvalue", "?").load("C:\\Scala\\SparkAnalytics\\block_1.csv")
After the file read into DataFrame I can round the decimals like:
parsedSchema.withColumn("cmp_fname_c1", round($"cmp_fname_c1", 3))
But this creates a new DataFrame, so I'd also like to know if it can be done in-place instead of creating a new DataFrame.
Thanks
You can specify, say, DecimalType(10, 2) for the DoubleType column in your customSchema when loading your CSV file. Let's say you have a CSV file with the following content:
id_1,id_2,Id_3
1,10,5.555
2,20,6.0
3,30,7.444
Sample code below:
import org.apache.spark.sql.types._
val customSchema = StructType(Array(
StructField("id_1", IntegerType, true),
StructField("id_2", IntegerType, true),
StructField("id_3", DecimalType(10, 2), true)
))
spark.read.format("csv").schema(customSchema).
option("header", "true").option("nullvalue", "?").
load("/path/to/csvfile").
show
// +----+----+----+
// |id_1|id_2|id_3|
// +----+----+----+
// | 1| 10|5.56|
// | 2| 20|6.00|
// | 3| 30|7.44|
// +----+----+----+

How to reference a dataframe when in an UDF on another dataframe?

How do you reference a pyspark dataframe when in the execution of an UDF on another dataframe?
Here's a dummy example. I am creating two dataframes scores and lastnames, and within each lies a column that is the same across the two dataframes. In the UDF applied on scores, I want to filter on lastnames and return a string found in lastname.
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sc = SparkContext("local")
sqlCtx = SQLContext(sc)
# Generate Random Data
import itertools
import random
student_ids = ['student1', 'student2', 'student3']
subjects = ['Math', 'Biology', 'Chemistry', 'Physics']
random.seed(1)
data = []
for (student_id, subject) in itertools.product(student_ids, subjects):
data.append((student_id, subject, random.randint(0, 100)))
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema = StructType([
StructField("student_id", StringType(), nullable=False),
StructField("subject", StringType(), nullable=False),
StructField("score", IntegerType(), nullable=False)
])
# Create DataFrame
rdd = sc.parallelize(data)
scores = sqlCtx.createDataFrame(rdd, schema)
# create another dataframe
last_name = ["Granger", "Weasley", "Potter"]
data2 = []
for i in range(len(student_ids)):
data2.append((student_ids[i], last_name[i]))
schema = StructType([
StructField("student_id", StringType(), nullable=False),
StructField("last_name", StringType(), nullable=False)
])
rdd = sc.parallelize(data2)
lastnames = sqlCtx.createDataFrame(rdd, schema)
scores.show()
lastnames.show()
from pyspark.sql.functions import udf
def getLastName(sid):
tmp_df = lastnames.filter(lastnames.student_id == sid)
return tmp_df.last_name
getLastName_udf = udf(getLastName, StringType())
scores.withColumn("last_name", getLastName_udf("student_id")).show(10)
And the following is the last part of the trace:
Py4JError: An error occurred while calling o114.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:335)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:344)
at py4j.Gateway.invoke(Gateway.java:252)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
You can't directly reference a dataframe (or an RDD) from inside a UDF. The DataFrame object is a handle on your driver that spark uses to represent the data and actions that will happen out on the cluster. The code inside your UDF's will run out on the cluster at a time of Spark's choosing. Spark does this by serializing that code, and making copies of any variables included in the closure and sending them out to each worker.
What instead you want to do, is use the constructs Spark provides in it's API to join/combine the two DataFrames. If one of the data sets is small, you can manually send out the data in a broadcast variable, and then access it from your UDF. Otherwise, you can just create the two dataframes like you did, then use the join operation to combine them. Something like this should work:
joined = scores.withColumnRenamed("student_id", "join_id")
joined = joined.join(lastnames, joined.join_id == lastnames.student_id)\
.drop("join_id")
joined.show()
+---------+-----+----------+---------+
| subject|score|student_id|last_name|
+---------+-----+----------+---------+
| Math| 13| student1| Granger|
| Biology| 85| student1| Granger|
|Chemistry| 77| student1| Granger|
| Physics| 25| student1| Granger|
| Math| 50| student2| Weasley|
| Biology| 45| student2| Weasley|
|Chemistry| 65| student2| Weasley|
| Physics| 79| student2| Weasley|
| Math| 9| student3| Potter|
| Biology| 2| student3| Potter|
|Chemistry| 84| student3| Potter|
| Physics| 43| student3| Potter|
+---------+-----+----------+---------+
It's also worth noting, that under the hood Spark DataFrames has an optimization where a DataFrame that is part of a join can be converted to a broadcast variable to avoid a shuffle if it is small enough. So if you do the join method listed above, you should get the best possible performance, without sacrificing the ability to handle larger data sets.
Changing pair to dictionary for easy lookup of names
data2 = {}
for i in range(len(student_ids)):
data2[student_ids[i]] = last_name[i]
Instead of creating rdd and making it to df create broadcast variable
//rdd = sc.parallelize(data2)
//lastnames = sqlCtx.createDataFrame(rdd, schema)
lastnames = sc.broadcast(data2)
Now access this in udf with values attr on broadcast variable(lastnames).
from pyspark.sql.functions import udf
def getLastName(sid):
return lastnames.value[sid]

Pyspark: Transforming PythonRDD to Dataframe

Could someone guide me to convert PythonRDD to a DataFrame.
As per my understanding, reading a file should create a DF, but in my case it has created a PythonRDD. I finding it hard to convert PythonRDD to a DataFrame. Could not find CreateDataFrame() or toDF().
Please find my below code to read a tab seperated text file:
rdd1 = sparkCxt.textFile(setting.REFRESH_HDFS_DIR + "/Refresh")
rdd2 = rdd1.map(lambda row: unicode(row).lower().strip()\
if type(row) == unicode else row)
Now, I would want to convert PythonRDD to a DF.
I wanted to convert to DF to map the schema, so that I could do further processing at column level.
Also, please suggest if you think there is a better approach.
Please reply if more details are required.
Thank you.
Spark DataFrames can be created directly from a text file, but you should use sqlContext instead of sc (SparkContext), since sqlContext is an entry point for working with DataFrames.
df = sqlContext.read.text('path/to/my/file')
This will create a DataFrame with a single column named value. You can use UDF functions to split it into required columns.
Another approach would be to read the text files to an RDD, split it into columns using map, reduce, filter and other operations, and then convert the final RDD to a DataFrame.
For example, let's say we have a RDD named my_rdd with the following structure:
[(1, 'Alice', 23), (2, 'Bob', 25)]
We can easily convert it to a DataFrame:
df = sqlContext.createDataFrame(my_rdd, ['id', 'name', 'age'])
where id, name and age are names for our columns.
you can try using toPandas() although you should be cautious when doing so since converting an rdd to pandas DataFrame will be like bringing all data into memory which might cause OOM error if your distributed data is large.
I would use the Spark-csv package (Spark-csv Github) and import directly into a dataframe after defining the schema.
For example:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sqlContext = SQLContext(sc)
customSchema = StructType([ \
StructField("year", IntegerType(), True), \
StructField("make", StringType(), True), \
StructField("model", StringType(), True), \
StructField("comment", StringType(), True), \
StructField("blank", StringType(), True)])
df = sqlContext.read \
.format('com.databricks.spark.csv') \
.options(header='true') \
.load('cars.csv', schema = customSchema)
This defaults to a comma for the delimiter, but you can change that to a tab with something like:
df = sqlContext.read \
.format('com.databricks.spark.csv') \
.options(header='true', delimiter='\t') \
.load('cars.csv', schema = customSchema)
Note that it is possible to infer the schema using another option, but this does require reading the entire file prior to loading the dataframe.

How to create an empty DataFrame? Why "ValueError: RDD is empty"?

I am trying to create an empty dataframe in Spark (Pyspark).
I am using similar approach to the one discussed here enter link description here, but it is not working.
This is my code
df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
This is the error
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/Me/Desktop/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.py", line 404, in createDataFrame
rdd, schema = self._createFromRDD(data, schema, samplingRatio)
File "/Users/Me/Desktop/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.py", line 285, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio)
File "/Users/Me/Desktop/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.py", line 229, in _inferSchema
first = rdd.first()
File "/Users/Me/Desktop/spark-1.5.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1320, in first
raise ValueError("RDD is empty")
ValueError: RDD is empty
extending Joe Widen's answer, you can actually create the schema with no fields like so:
schema = StructType([])
so when you create the DataFrame using that as your schema, you'll end up with a DataFrame[].
>>> empty = sqlContext.createDataFrame(sc.emptyRDD(), schema)
DataFrame[]
>>> empty.schema
StructType(List())
In Scala, if you choose to use sqlContext.emptyDataFrame and check out the schema, it will return StructType().
scala> val empty = sqlContext.emptyDataFrame
empty: org.apache.spark.sql.DataFrame = []
scala> empty.schema
res2: org.apache.spark.sql.types.StructType = StructType()
At the time this answer was written it looks like you need some sort of schema
from pyspark.sql.types import *
field = [StructField("field1", StringType(), True)]
schema = StructType(field)
sc = spark.sparkContext
sqlContext.createDataFrame(sc.emptyRDD(), schema)
This will work with spark version 2.0.0 or more
from pyspark.sql import SQLContext
sc = spark.sparkContext
schema = StructType([StructField('col1', StringType(), False),StructField('col2', IntegerType(), True)])
sqlContext.createDataFrame(sc.emptyRDD(), schema)
spark.range(0).drop("id")
This creates a DataFrame with an "id" column and no rows then drops the "id" column, leaving you with a truly empty DataFrame.
You can just use something like this:
pivot_table = sparkSession.createDataFrame([("99","99")], ["col1","col2"])
If you want an empty dataframe based on an existing one, simple limit rows to 0.
In PySpark :
emptyDf = existingDf.limit(0)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType
spark = SparkSession.builder.appName('SparkPractice').getOrCreate()
schema = StructType([
StructField('firstname', StringType(), True),
StructField('middlename', StringType(), True),
StructField('lastname', StringType(), True)
])
df = spark.createDataFrame(spark.sparkContext.emptyRDD(),schema)
df.printSchema()
This is a roundabout but simple way to create an empty spark df with an inferred schema
# Initialize a spark df using one row of data with the desired schema
init_sdf = spark.createDataFrame([('a_string', 0, 0)], ['name', 'index', 'seq_#'])
# remove the row. Leaves the schema
empty_sdf = init_sdf.where(col('name') == 'not_match')
empty_sdf.printSchema()
# Output
root
|-- name: string (nullable = true)
|-- index: long (nullable = true)
|-- seq_#: long (nullable = true)
Seq.empty[String].toDF()
This will create a empty df. Helpful for testing purposes and all. (Scala-Spark)
In Spark 3.1.2, the spark.sparkContext.emptyRDD() function throws an error. Using the schema, passing an empty list will work:
df = spark.createDataFrame([], schema)
You can do it by loading an empty file (parquet, json etc.) like this:
df = sqlContext.read.json("my_empty_file.json")
Then when you try to check the schema you'll see:
>>> df.printSchema()
root
In Scala/Java not passing a path should work too, in Python it throws an exception. Also if you ever switch to Scala/Python you can use this method to create one.
You can create an empty data frame by using following syntax in pyspark:
df = spark.createDataFrame([], ["col1", "col2", ...])
where [] represents the empty value for col1 and col2. Then you can register as temp view for your sql queries:
**df2.createOrReplaceTempView("artist")**

Calculating duration by subtracting two datetime columns in string format

I have a Spark Dataframe in that consists of a series of dates:
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *
sqlContext = SQLContext(sc)
import pandas as pd
rdd = sc.parallelizesc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876','sip:4534454450'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321','sip:6413445440'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229','sip:4534437492'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881','sip:6474454453'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323','sip:8874458555')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
What I want to do is find duration by subtracting EndDateTime and StartDateTime. I figured I'd try and do this using a function:
# Function to calculate time delta
def time_delta(y,x):
end = pd.to_datetime(y)
start = pd.to_datetime(x)
delta = (end-start)
return delta
# create new RDD and add new column 'Duration' by applying time_delta function
df2 = df.withColumn('Duration', time_delta(df.EndDateTime, df.StartDateTime))
However this just gives me:
>>> df2.show()
ID EndDateTime StartDateTime ANI Duration
X01 2014-02-13T12:36:... 2014-02-13T12:31:... sip:4534454450 null
X02 2014-02-13T12:35:... 2014-02-13T12:32:... sip:6413445440 null
X03 2014-02-13T12:36:... 2014-02-13T12:32:... sip:4534437492 null
XO4 2014-02-13T12:37:... 2014-02-13T12:32:... sip:6474454453 null
XO5 2014-02-13T12:36:... 2014-02-13T12:33:... sip:8874458555 null
I'm not sure if my approach is correct or not. If not, I'd gladly accept another suggested way to achieve this.
As of Spark 1.5 you can use unix_timestamp:
from pyspark.sql import functions as F
timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
timeDiff = (F.unix_timestamp('EndDateTime', format=timeFmt)
- F.unix_timestamp('StartDateTime', format=timeFmt))
df = df.withColumn("Duration", timeDiff)
Note the Java style time format.
>>> df.show()
+---+--------------------+--------------------+--------+
| ID| EndDateTime| StartDateTime|Duration|
+---+--------------------+--------------------+--------+
|X01|2014-02-13T12:36:...|2014-02-13T12:31:...| 258|
|X02|2014-02-13T12:35:...|2014-02-13T12:32:...| 204|
|X03|2014-02-13T12:36:...|2014-02-13T12:32:...| 228|
|XO4|2014-02-13T12:37:...|2014-02-13T12:32:...| 269|
|XO5|2014-02-13T12:36:...|2014-02-13T12:33:...| 202|
+---+--------------------+--------------------+--------+
Thanks to David Griffin. Here's how to do this for future reference.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# define timedelta function (obtain duration in seconds)
def time_delta(y,x):
from datetime import datetime
end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
delta = (end-start).total_seconds()
return delta
# register as a UDF
f = udf(time_delta, IntegerType())
# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Applying time_delta() will give you duration in seconds:
>>> df2.show()
ID EndDateTime StartDateTime Duration
X01 2014-02-13T12:36:... 2014-02-13T12:31:... 258
X02 2014-02-13T12:35:... 2014-02-13T12:32:... 204
X03 2014-02-13T12:36:... 2014-02-13T12:32:... 228
XO4 2014-02-13T12:37:... 2014-02-13T12:32:... 268
XO5 2014-02-13T12:36:... 2014-02-13T12:33:... 202
datediff(Column end, Column start)
Returns the number of days from start to end.
https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html
This can be done in spark-sql by converting the string date to timestamp and then getting the difference.
1: Convert to timestamp:
CAST(UNIX_TIMESTAMP(MY_COL_NAME,'dd-MMM-yy') as TIMESTAMP)
2: Get the difference between dates using datediff function.
This will be combined in a nested function like:
spark.sql("select COL_1, COL_2, datediff( CAST( UNIX_TIMESTAMP( COL_1,'dd-MMM-yy') as TIMESTAMP), CAST( UNIX_TIMESTAMP( COL_2,'dd-MMM-yy') as TIMESTAMP) ) as LAG_in_days from MyTable")
Below is the result:
+---------+---------+-----------+
| COL_1| COL_2|LAG_in_days|
+---------+---------+-----------+
|24-JAN-17|16-JAN-17| 8|
|19-JAN-05|18-JAN-05| 1|
|23-MAY-06|23-MAY-06| 0|
|18-AUG-06|17-AUG-06| 1|
+---------+---------+-----------+
Reference: https://docs-snaplogic.atlassian.net/wiki/spaces/SD/pages/2458071/Date+Functions+and+Properties+Spark+SQL
Use DoubleType instead of IntegerType
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf
# Build sample data
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# define timedelta function (obtain duration in seconds)
def time_delta(y,x):
from datetime import datetime
end = datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')
start = datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')
delta = (end-start).total_seconds()
return delta
# register as a UDF
f = udf(time_delta, DoubleType())
# Apply function
df2 = df.withColumn('Duration', f(df.EndDateTime, df.StartDateTime))
Here is a working version for spark 2.x derived from jason's answer
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import StringType, StructType, StructField
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()
rdd = sc.parallelize([('X01','2014-02-13T12:36:14.899','2014-02-13T12:31:56.876'),
('X02','2014-02-13T12:35:37.405','2014-02-13T12:32:13.321'),
('X03','2014-02-13T12:36:03.825','2014-02-13T12:32:15.229'),
('XO4','2014-02-13T12:37:05.460','2014-02-13T12:32:36.881'),
('XO5','2014-02-13T12:36:52.721','2014-02-13T12:33:30.323')])
schema = StructType([StructField('ID', StringType(), True),
StructField('EndDateTime', StringType(), True),
StructField('StartDateTime', StringType(), True)])
df = sqlContext.createDataFrame(rdd, schema)
# register as a UDF
from datetime import datetime
sqlContext.registerFunction("time_delta", lambda y,x:(datetime.strptime(y, '%Y-%m-%dT%H:%M:%S.%f')-datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f')).total_seconds())
df.createOrReplaceTempView("Test_table")
spark.sql("SELECT ID,EndDateTime,StartDateTime,time_delta(EndDateTime,StartDateTime) as time_delta FROM Test_table").show()
sc.stop()

Resources