Select columns that satisfy a condition - apache-spark

I'm running the following notebook in zeppelin:
%spark.pyspark
l = [('user1', 33, 1.0, 'chess'), ('user2', 34, 2.0, 'tenis'), ('user3', None, None, ''), ('user4', None, 4.0, ' '), ('user5', None, 5.0, 'ski')]
df = spark.createDataFrame(l, ['name', 'age', 'ratio', 'hobby'])
df.show()
root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- ratio: double (nullable = true)
|-- hobby: string (nullable = true)
+-----+----+-----+-----+
| name| age|ratio|hobby|
+-----+----+-----+-----+
|user1| 33| 1.0|chess|
|user2| 34| 2.0|tenis|
|user3|null| null| |
|user4|null| 4.0| |
|user5|null| 5.0| ski|
+-----+----+-----+-----+
agg_df = df.select(*[(1.0 - (count(c) / count('*'))).alias(c) for c in df.columns])
agg_df.show()
root
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- ratio: double (nullable = true)
|-- hobby: string (nullable = true)
+----+---+-------------------+-----+
|name|age| ratio|hobby|
+----+---+-------------------+-----+
| 0.0|0.6|0.19999999999999996| 0.0|
+----+---+-------------------+-----+
Now, I want to select in agg_df only columns which value is < 0.35. In this case it should return ['name', 'ratio', 'hobby']
I can't figure out how to do it. Any hint?

you mean values < 0.35?. This should do
>>> [ key for (key,value) in agg_df.collect()[0].asDict().items() if value < 0.35 ]
['hobby', 'ratio', 'name']
to replace blank string with Null use the following udf function.
from pyspark.sql.functions import udf
process = udf(lambda x: None if not x else (x if x.strip() else None))
df.withColumn('hobby', process(df.hobby)).show()
+-----+----+-----+-----+
| name| age|ratio|hobby|
+-----+----+-----+-----+
|user1| 33| 1.0|chess|
|user2| 34| 2.0|tenis|
|user3|null| null| null|
|user4|null| 4.0| null|
|user5|null| 5.0| ski|
+-----+----+-----+-----+

Here is my attempt for the function I was looking for based on rogue-one indications. Not sure if it is the fastest or most optimized:
from pyspark.sql.functions import udf, count
from functools import reduce
def filter_columns(df, threshold=0.35):
process = udf(lambda x: None if not x else (x if x.strip() else None)) # udf for stripping string values
string_cols = ([c for c in df.columns if df.select(c).dtypes[0][1] == 'string']) # string columns
new_df = reduce(lambda df, x: df.withColumn(x, process(x)), string_cols, df) # process all string columns
agg_df = new_df.select(*[(1.0 - (count(c) / count('*'))).alias(c) for c in new_df.columns]) # compute non-null/df.count ratio
cols_match_threshold = [ key for (key, value) in agg_df.collect()[0].asDict().items() if value < threshold ] # select only cols which value < threshold
return new_df.select(cols_match_threshold)
filter_columns(df, 0.35).show()
+-----+-----+
|ratio| name|
+-----+-----+
| 1.0|user1|
| 2.0|user2|
| null|user3|
| 4.0|user4|
| 5.0|user5|
+-----+-----+

Related

spark schema difference in partitions

I have to read data from a path which is partitioned by region.
US region has columns a,b,c,d,e
EUR region has only a,b,c,d
When I read data from the path and doing a printSchema, I am seeing only a,b,c,d 'e' is missing.
Is there any way to handle this situation? Like column e automatically gets populated with null for EUR data...?
You can use the mergeSchema option that should do exactly what you are looking for as long as columns with the same name have the same type.
Example:
spark.read.option("mergeSchema", "true").format("parquet").load(...)
Once you read the data from the path, you can check if data frame contains column 'e'. If it does not, then you could add this with default value which is None is this case.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
spark = SparkSession.builder \
.appName('example') \
.getOrCreate()
df = spark.createDataFrame(data=data, schema = columns)
if 'e' not in df.columns:
df = df.withColumn('e',lit(None))
You can collect all the possible columns from both dataset then fill None if that column is not available in each dataset
df_ab = (spark
.sparkContext
.parallelize([
('a1', 'b1'),
('a2', 'b2'),
])
.toDF(['a', 'b'])
)
df_ab.show()
# +---+---+
# | a| b|
# +---+---+
# | a1| b1|
# | a2| b2|
# +---+---+
df_abcd = (spark
.sparkContext
.parallelize([
('a3', 'b3', 'c3', 'd3'),
('a4', 'b4', 'c4', 'd4'),
])
.toDF(['a', 'b', 'c', 'd'])
)
df_abcd.show()
# +---+---+---+---+
# | a| b| c| d|
# +---+---+---+---+
# | a3| b3| c3| d3|
# | a4| b4| c4| d4|
# +---+---+---+---+
unique_columns = list(set(df_ab.columns + df_abcd.columns))
# ['d', 'b', 'a', 'c']
for col in unique_columns:
if col not in df_ab.columns:
df_ab = df_ab.withColumn(col, F.lit(None))
if col not in df_abcd.columns:
df_abcd = df_abcd.withColumn(col, F.lit(None))
df_ab.printSchema()
# root
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- d: null (nullable = true)
# |-- c: null (nullable = true)
df_ab.show()
# +---+---+----+----+
# | a| b| d| c|
# +---+---+----+----+
# | a1| b1|null|null|
# | a2| b2|null|null|
# +---+---+----+----+
df_abcd.printSchema()
# root
# |-- a: string (nullable = true)
# |-- b: string (nullable = true)
# |-- c: string (nullable = true)
# |-- d: string (nullable = true)
df_abcd.show()
# +---+---+---+---+
# | a| b| c| d|
# +---+---+---+---+
# | a3| b3| c3| d3|
# | a4| b4| c4| d4|
# +---+---+---+---+
I used pyspark and SQLContext. Hope this implementation will help you to get an idea. Spark provides an environment to use SQL and it is very convenient to use SPARK SQL for these type of things.
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql import SQLContext
import sys
import os
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
class getData(object):
"""docstring for getData"""
def __init__(self):
def get_data(self, n):
spark = SparkSession.builder.appName('YourProjectName').getOrCreate()
data2 = [("region 1","region 2","region 3","region 4"),
("region 5","region 6","region 7","region 8")
]
schema = StructType([ \
StructField("a",StringType(),True), \
StructField("b",StringType(),True), \
StructField("c",StringType(),True), \
StructField("d", StringType(), True) \
])
data3 = [("EU region 1","EU region 2","EU region 3"),
("EU region 5","EU region 6","EU region 7")
]
schema3 = StructType([ \
StructField("a",StringType(),True), \
StructField("b",StringType(),True), \
StructField("c",StringType(),True) \
])
df = spark.createDataFrame(data=data2,schema=schema)
df.createOrReplaceTempView("USRegion")
sqlDF = self.sparkSession1.sql("SELECT * FROM USRegion")
sqlDF.show(n=600)
df1 = spark.createDataFrame(data=data3,schema=schema3)
df1.createOrReplaceTempView("EURegion")
sqlDF1 = self.sparkSession1.sql("SELECT * FROM EURegion")
sqlDF1.show(n=600)
sql_union_df = self.sparkSession1.sql("SELECT a, b, c, d FROM USRegion uNION ALL SELECT a,b, c, '' as d FROM EURegion ")
sql_union_df.show(n=600)
#call the class
conn = getData()
#call the method implemented inside the class
print(conn.get_data(10))

How do you remove an ambiguous column in pyspark?

There are many questions similar to this that are asking a different question with regard to avoid duplicate columns in a join; that is not what I am asking here.
Given that I already have a DataFrame with ambiguous columns, how do I remove a specific column?
For example, given:
df = spark.createDataFrame(
spark.sparkContext.parallelize([
[1, 0.0, "ext-0.0"],
[1, 1.0, "ext-1.0"],
[2, 1.0, "ext-2.0"],
[3, 2.0, "ext-3.0"],
[4, 3.0, "ext-4.0"],
]),
StructType([
StructField("id", IntegerType(), True),
StructField("shared", DoubleType(), True),
StructField("shared", StringType(), True),
])
)
I wish to retain only the numeric columns.
However, attempting to do something like df.select("id", "shared").show() results in:
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: "Reference 'shared' is ambiguous, could be: shared, shared.;"
Many related solution to this problem are simply 'avoid ever getting into this situation', eg. by using ['joinkey'] instead of a.joinkey = b.joinkey on the join. I reiterate that this is not the situation here; this relates to a dataframe that has already been converted into this form.
The metadata from the DF disambiguates these columns:
$ df.dtypes
[('id', 'int'), ('shared', 'double'), ('shared', 'string')]
$ df.schema
StructType(List(StructField(id,IntegerType,true),StructField(shared,DoubleType,true),StructField(shared,StringType,true)))
So the data is retained internally... I just can't see how to use it.
How do I pick one column over the other?
I expected to be able to use, eg. col('shared#11') or similar... but there is nothing like that I can see?
Is this simply not possible in spark?
To answer this question, I would ask, please post either a) a working code snippet that solves the problem above, or b) link to something official from the spark developers that this simply isn't supported?
the easiest solution to this problem is to rename using df.toDF(...<new-col-names>...), but if you don't wanted to change the column name then group the duplicated columns by their type as struct<type1, type2> as below-
Please note that below solution is written in scala, but logically similar code can be implemented in python. Also this solution will work for all duplicate columns in the dataframe-
1. Load the test data
val df = Seq((1, 2.0, "shared")).toDF("id", "shared", "shared")
df.show(false)
df.printSchema()
/**
* +---+------+------+
* |id |shared|shared|
* +---+------+------+
* |1 |2.0 |shared|
* +---+------+------+
*
* root
* |-- id: integer (nullable = false)
* |-- shared: double (nullable = false)
* |-- shared: string (nullable = true)
*/
2. get all the duplicated column names
// 1. get all the duplicated column names
val findDupCols = (cols: Array[String]) => cols.map((_ , 1)).groupBy(_._1).filter(_._2.length > 1).keys.toSeq
val dupCols = findDupCols(df.columns)
println(dupCols.mkString(", "))
// shared
3. rename duplicate cols like shared => shared:string, shared:int, without touching the other column names
val renamedDF = df
// 2 rename duplicate cols like shared => shared:string, shared:int
.toDF(df.schema
.map{case StructField(name, dt, _, _) =>
if(dupCols.contains(name)) s"$name:${dt.simpleString}" else name}: _*)
3. create struct of all cols
// 3. create struct of all cols
val structCols = df.schema.map(f => f.name -> f ).groupBy(_._1)
.map{case(name, seq) =>
if (seq.length > 1)
struct(
seq.map { case (_, StructField(fName, dt, _, _)) =>
expr(s"`$fName:${dt.simpleString}` as ${dt.simpleString}")
}: _*
).as(name)
else col(name)
}.toSeq
val structDF = renamedDF.select(structCols: _*)
structDF.show(false)
structDF.printSchema()
/**
* +-------------+---+
* |shared |id |
* +-------------+---+
* |[2.0, shared]|1 |
* +-------------+---+
*
* root
* |-- shared: struct (nullable = false)
* | |-- double: double (nullable = false)
* | |-- string: string (nullable = true)
* |-- id: integer (nullable = false)
*/
4. get column by their type using <column_name>.<datatype>
// Use the dataframe without losing any columns
structDF.selectExpr("id", "shared.double as shared").show(false)
/**
* +---+------+
* |id |shared|
* +---+------+
* |1 |2.0 |
* +---+------+
*/
Hope this is useful to someone!
It seems this is possible by replacing the schema using .rdd.toDf() on the dataframe.
However, I'll still accept any answer that is less convoluted and annoying than the one below:
import random
import string
from pyspark.sql.types import DoubleType, LongType
def makeId():
return ''.join(random.choice(string.ascii_lowercase) for _ in range(6))
def makeUnique(column):
return "%s---%s" % (column.name, makeId())
def makeNormal(column):
return column.name.split("---")[0]
unique_schema = list(map(makeUnique, df.schema))
df_unique = df.rdd.toDF(schema=unique_schema)
df_unique.show()
numeric_cols = filter(lambda c: c.dataType.__class__ in [LongType, DoubleType], df_unique.schema)
numeric_col_names = list(map(lambda c: c.name, numeric_cols))
df_filtered = df_unique.select(*numeric_col_names)
df_filtered.show()
normal_schema = list(map(makeNormal, df_filtered.schema))
df_fixed = df_filtered.rdd.toDF(schema=normal_schema)
df_fixed.show()
Gives:
+-----------+---------------+---------------+
|id---chjruu|shared---aqboua|shared---ehjxor|
+-----------+---------------+---------------+
| 1| 0.0| ext-0.0|
| 1| 1.0| ext-1.0|
| 2| 1.0| ext-2.0|
| 3| 2.0| ext-3.0|
| 4| 3.0| ext-4.0|
+-----------+---------------+---------------+
+-----------+---------------+
|id---chjruu|shared---aqboua|
+-----------+---------------+
| 1| 0.0|
| 1| 1.0|
| 2| 1.0|
| 3| 2.0|
| 4| 3.0|
+-----------+---------------+
+---+------+
| id|shared|
+---+------+
| 1| 0.0|
| 1| 1.0|
| 2| 1.0|
| 3| 2.0|
| 4| 3.0|
+---+------+
Workaround: Simply rename the columns (in order) and then do whatever you wanted to do!
renamed_df = df.toDF("id", "shared_double", "shared_string")

Spark SQL not equals not working on column created using lag

I am trying to find missing sequences in a CSV File and here is the code i have
val customSchema = StructType(
Array(
StructField("MessageId", StringType, false),
StructField("msgSeqID", LongType, false),
StructField("site", StringType, false),
StructField("msgType", StringType, false)
)
)
val logFileDF = sparkSession.sqlContext.read.format("csv")
.option("delimiter",",")
.option("header", false)
.option("mode", "DROPMALFORMED")
.schema(customSchema)
.load(logFilePath)
.toDF()
logFileDF.printSchema()
logFileDF.repartition(5000)
logFileDF.createOrReplaceTempView("LogMessageData")
sparkSession.sqlContext.cacheTable("LogMessageData")
val selectQuery: String = "SELECT MessageId,site,msgSeqID,msgType,lag(msgSeqID,1) over (partition by site,msgType order by site,msgType,msgSeqID) as prev_val FROM LogMessageData order by site,msgType,msgSeqID"
val logFileLagRowNumDF = sparkSession.sqlContext.sql(selectQuery).toDF()
logFileLagRowNumDF.repartition(1000)
logFileLagRowNumDF.printSchema()
logFileLagRowNumDF.createOrReplaceTempView("LogMessageDataUpdated")
sparkSession.sqlContext.cacheTable("LogMessageDataUpdated")
val errorRecordQuery: String = "select * from LogMessageDataUpdated where prev_val!=null and msgSeqID != prev_val+1 order by site,msgType,msgSeqID";
val errorRecordQueryDF = sparkSession.sqlContext.sql(errorRecordQuery).toDF()
logger.info("Total. No.of Missing records =[" + errorRecordQueryDF.count() + "]")
val noOfMissingRecords = errorRecordQueryDF.count()
Here is the sample Data i have
Msg_1,S1,A,10000000
Msg_2,S1,A,10000002
Msg_3,S2,A,10000003
Msg_4,S3,B,10000000
Msg_5,S3,B,10000001
Msg_6,S3,A,10000003
Msg_7,S3,A,10000001
Msg_8,S3,A,10000002
Msg_9,S4,A,10000000
Msg_10,S4,A,10000001
Msg_11,S4,A,10000000
Msg_12,S4,A,10000005
And here is the output i am getting.
root
|-- MessageId: string (nullable = true)
|-- site: string (nullable = true)
|-- msgType: string (nullable = true)
|-- msgSeqID: long (nullable = true)
INFO EimComparisonProcessV2 - Total No.Of Records =[12]
+---------+----+-------+--------+
|MessageId|site|msgType|msgSeqID|
+---------+----+-------+--------+
| Msg_1| S1| A|10000000|
| Msg_2| S1| A|10000002|
| Msg_3| S2| A|10000003|
| Msg_4| S3| B|10000000|
| Msg_5| S3| B|10000001|
| Msg_6| S3| A|10000003|
| Msg_7| S3| A|10000001|
| Msg_8| S3| A|10000002|
| Msg_9| S4| A|10000000|
| Msg_10| S4| A|10000001|
| Msg_11| S4| A|10000000|
| Msg_12| S4| A|10000005|
+---------+----+-------+--------+
root
|-- MessageId: string (nullable = true)
|-- site: string (nullable = true)
|-- msgSeqID: long (nullable = true)
|-- msgType: string (nullable = true)
|-- prev_val: long (nullable = true)
+---------+----+--------+-------+--------+
|MessageId|site|msgSeqID|msgType|prev_val|
+---------+----+--------+-------+--------+
| Msg_1| S1|10000000| A| null|
| Msg_2| S1|10000002| A|10000000|
| Msg_3| S2|10000003| A| null|
| Msg_7| S3|10000001| A| null|
| Msg_8| S3|10000002| A|10000001|
+---------+----+--------+-------+--------+
only showing top 5 rows
INFO TestProcess - Total No.Of Records Updated DF=[12]
INFO TestProcess - Total. No.of Missing records =[0]
Just replace your query with this line of code it will show you the results.
val errorRecordQuery: String = "select * from LogMessageDataUpdated where prev_val is not null and msgSeqID <> prev_val+1 order by site,msgType,msgSeqID";

How to convert RDD[Array[Any]] to DataFrame?

I have RDD[Array[Any]] as follows,
1556273771,Mumbai,1189193,1189198,0.56,-1,India,Australia,1571215104,1571215166
8374749403,London,1189193,1189198,0,1,India,England,4567362933,9374749392
7439430283,Dubai,1189193,1189198,0.76,-1,Pakistan,Sri Lanka,1576615684,4749383749
I need to convert this to a data frame of 10 columns, but I am new to spark. Please let me know how to do this in the simplest way.
I am trying something similar to this code:
rdd_data.map{case Array(a,b,c,d,e,f,g,h,i,j) => (a,b,c,d,e,f,g,h,i,j)}.toDF()
When you create a dataframe, Spark needs to know the data type of each column. "Any" type is just a way of saying that you don't know the variable type. A possible solution is to cast each value to a specific type. This will of course fail if the specified cast is invalid.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
val rdd1 = spark.sparkContext.parallelize(
Array(
Array(1556273771L,"Mumbai",1189193,1189198 ,0.56,-1,"India", "Australia",1571215104L,1571215166L),
Array(8374749403L,"London",1189193,1189198 ,0 , 1,"India", "England", 4567362933L,9374749392L),
Array(7439430283L,"Dubai" ,1189193,1189198 ,0.76,-1,"Pakistan","Sri Lanka",1576615684L,4749383749L)
),1)
//rdd1: org.apache.spark.rdd.RDD[Array[Any]]
val rdd2 = rdd1.map(r => Row(
r(0).toString.toLong,
r(1).toString,
r(2).toString.toInt,
r(3).toString.toInt,
r(4).toString.toDouble,
r(5).toString.toInt,
r(6).toString,
r(7).toString,
r(8).toString.toLong,
r(9).toString.toLong
))
val schema = StructType(
List(
StructField("col0", LongType, false),
StructField("col1", StringType, false),
StructField("col2", IntegerType, false),
StructField("col3", IntegerType, false),
StructField("col4", DoubleType, false),
StructField("col5", IntegerType, false),
StructField("col6", StringType, false),
StructField("col7", StringType, false),
StructField("col8", LongType, false),
StructField("col9", LongType, false)
)
)
val df = spark.createDataFrame(rdd2, schema)
df.show
+----------+------+-------+-------+----+----+--------+---------+----------+----------+
| col0| col1| col2| col3|col4|col5| col6| col7| col8| col9|
+----------+------+-------+-------+----+----+--------+---------+----------+----------+
|1556273771|Mumbai|1189193|1189198|0.56| -1| India|Australia|1571215104|1571215166|
|8374749403|London|1189193|1189198| 0.0| 1| India| England|4567362933|9374749392|
|7439430283| Dubai|1189193|1189198|0.76| -1|Pakistan|Sri Lanka|1576615684|4749383749|
+----------+------+-------+-------+----+----+--------+---------+----------+----------+
df.printSchema
root
|-- col0: long (nullable = false)
|-- col1: string (nullable = false)
|-- col2: integer (nullable = false)
|-- col3: integer (nullable = false)
|-- col4: double (nullable = false)
|-- col5: integer (nullable = false)
|-- col6: string (nullable = false)
|-- col7: string (nullable = false)
|-- col8: long (nullable = false)
|-- col9: long (nullable = false)
Hope it helps
As the other posts mention, a DataFrame requires explicit types for each column, so you can't use Any. The easiest way I can think of would be to turn each row into a tuple of the right types then use implicit DF creation to convert to a DataFrame. You were pretty close in your code, you just need to cast the elements to an acceptable type.
Basically toDF knows how to convert tuples (with accepted types) into a DF Row, and you can pass the column names into the toDF call.
For example:
val data = Array(1556273771, "Mumbai", 1189193, 1189198, 0.56, -1, "India,Australia", 1571215104, 1571215166)
val rdd = sc.parallelize(Seq(data))
val df = rdd.map {
case Array(a,b,c,d,e,f,g,h,i) => (
a.asInstanceOf[Int],
b.asInstanceOf[String],
c.asInstanceOf[Int],
d.asInstanceOf[Int],
e.toString.toDouble,
f.asInstanceOf[Int],
g.asInstanceOf[String],
h.asInstanceOf[Int],
i.asInstanceOf[Int]
)
}.toDF("int1", "city", "int2", "int3", "float1", "int4", "country", "int5", "int6")
df.printSchema
df.show(100, false)
scala> df.printSchema
root
|-- int1: integer (nullable = false)
|-- city: string (nullable = true)
|-- int2: integer (nullable = false)
|-- int3: integer (nullable = false)
|-- float1: double (nullable = false)
|-- int4: integer (nullable = false)
|-- country: string (nullable = true)
|-- int5: integer (nullable = false)
|-- int6: integer (nullable = false)
scala> df.show(100, false)
+----------+------+-------+-------+------+----+---------------+----------+----------+
|int1 |city |int2 |int3 |float1|int4|country |int5 |int6 |
+----------+------+-------+-------+------+----+---------------+----------+----------+
|1556273771|Mumbai|1189193|1189198|0.56 |-1 |India,Australia|1571215104|1571215166|
+----------+------+-------+-------+------+----+---------------+----------+----------+
Edit for 0 -> Double:
As André pointed out, if you start off with 0 as an Any it will be a java Integer, not a scala Int, and therefore not castable to a scala Double. Converting it to a string first lets you then convert it into a double as desired.
You can try below approach, it's a bit tricky but without bothering with schema.
Map Any to String using toDF(), create DataFrame of arrays then create new columns by selecting each element from array column.
val rdd: RDD[Array[Any]] = spark.range(5).rdd.map(s => Array(s,s+1,s%2))
val size = rdd.first().length
def splitCol(col: Column): Seq[(String, Column)] = {
(for (i <- 0 to size - 1) yield ("_" + i, col(i)))
}
import spark.implicits._
rdd.map(s=>s.map(s=>s.toString()))
.toDF("x")
.select(splitCol('x).map(_._2):_*)
.toDF(splitCol('x).map(_._1):_*)
.show()
+---+---+---+
| _0| _1| _2|
+---+---+---+
| 0| 1| 0|
| 1| 2| 1|
| 2| 3| 0|
| 3| 4| 1|
| 4| 5| 0|
+---+---+---+

How to perform calculation in spark dataframe that select from its own dataframe using pyspark

I have a pyspark schema which look like this :
root
|-- id: string (nullable = true)
|-- long: float (nullable = true)
|-- lat: float (nullable = true)
|-- geohash: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: string (containsNull = true)
The data look like this :
+---+---------+----------+---------+--------------------+
| id| lat| long|geohash_8| neighbors|
+---+---------+----------+---------+--------------------+
| 0|-6.361755| 106.79653| qqggy1yu|[qqggy1ys, qqggy1...|
| 1|-6.358584|106.793945| qqggy4ky|[qqggy4kw, qqggy4...|
| 2|-6.362967|106.798775| qqggy38m|[qqggy38j, qqggy3...|
| 3|-6.358316| 106.79832| qqggy680|[qqggy4xb, qqggy6...|
| 4| -6.36016| 106.7981| qqggy60j|[qqggy4pv, qqggy6...|
| 5|-6.357476| 106.79842| qqggy68j|[qqggy4xv, qqggy6...|
| 6|-6.360814| 106.79435| qqggy4j3|[qqggy4j1, qqggy4...|
| 7|-6.358231|106.794365| qqggy4t2|[qqggy4t0, qqggy4...|
| 8|-6.357654| 106.79736| qqggy4x7|[qqggy4x5, qqggy4...|
| 9|-6.358781|106.794624| qqggy4mm|[qqggy4mj, qqggy4...|
| 10|-6.357654| 106.79443| qqggy4t7|[qqggy4t5, qqggy4...|
| 11|-6.357079| 106.79443| qqggy4tr|[qqggy4tp, qqggy4...|
| 12|-6.359929| 106.79698| qqggy4pn|[qqggy4ny, qqggy4...|
| 13|-6.358111| 106.79633| qqggy4w9|[qqggy4w3, qqggy4...|
| 14|-6.359685| 106.79607| qqggy4q8|[qqggy4q2, qqggy4...|
| 15|-6.357945|106.794945| qqggy4td|[qqggy4t6, qqggy4...|
| 16|-6.360725|106.795456| qqggy4n4|[qqggy4jf, qqggy4...|
| 17|-6.363701| 106.79653| qqggy1wb|[qqggy1w8, qqggy1...|
| 18| -6.36329|106.794586| qqggy1t7|[qqggy1t5, qqggy1...|
| 19|-6.363304| 106.79429| qqggy1t5|[qqggy1sg, qqggy1...|
+---+---------+----------+---------+--------------------+
I want to calculate the distance from each id with its lat long and select all the lat long from all his neighbors then calculate the distance. Then every id will have list of distances in meters with all his neighbors.
I tried using iterative way, which loop every rows then select a dataframe then compute the haversine distance, However the performance is awful. I am stuck on how to apply using functional way in spark. Can anyone help with some suggestion or references.
Updated to address desire for combinations
If you want to do all the combinations, the steps are basically, associate each neighbor ID with it's lat/long, group them together into a single row for each combination set, then do compute distance on all the combinations. Here is example code:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import itertools
schema = StructType([
StructField("id", StringType()),
StructField("lat", FloatType()),
StructField("long", FloatType()),
StructField("geohash_8", StringType()),
StructField("neighbors", ArrayType(StringType()))
])
data = [
("0", 10.0, 11.0, "A", ["B", "C", "D"]),
("1", 12.0, 13.0, "B", ["D"]),
("2", 14.0, 15.0, "C", []),
("3", 16.0, 17.0, "D", [])
]
input_df = spark.createDataFrame(sc.parallelize(data), schema)
# Explode to get a row for each comparison pair
df = input_df.withColumn('neighbor', explode('neighbors')).drop('neighbors')
# Join to get the lat/lon of the neighbor
neighbor_map = input_df.selectExpr('geohash_8 as nid', 'lat as nlat', 'long as nlong')
df = df.join(neighbor_map , col('neighbor') == col('nid'), 'inner').drop('nid')
# Add in rows for the root (geohash_8) records before grouping
root_rows = input_df.selectExpr("id", "lat", "long", "geohash_8", "geohash_8 as neighbor", "lat as nlat", "long as nlong")
df = df.unionAll(root_rows)
# Group by to roll the rows back up but now associating the lat/lon w/ the neighbors
df = df.selectExpr("id", "lat", "long", "geohash_8", "struct(neighbor, nlat, nlong) as neighbors")
df = df.groupBy("id", "lat", "long", "geohash_8").agg(collect_set("neighbors").alias("neighbors"))
# You now have all the data you need in one field, so you can write a python udf to do the combinations
def compute_distance(left_lat, left_lon, right_lat, right_lon):
return 10.0
def combinations(neighbors):
result = []
for left, right in itertools.combinations(neighbors, 2):
dist = compute_distance(left['nlat'], left['nlong'], right['nlat'], right['nlong'])
result.append(Row(left=left['neighbor'], right=right['neighbor'], dist=dist))
return result
udf_schema = ArrayType(StructType([
StructField("left", StringType()),
StructField("right", StringType()),
StructField("dist", FloatType())
]))
combinations_udf = udf(combinations, udf_schema)
# Finally, apply the UDF
df = df.withColumn('neighbors', combinations_udf(col('neighbors')))
df.printSchema()
df.show()
Which produces this:
root
|-- id: string (nullable = true)
|-- lat: float (nullable = true)
|-- long: float (nullable = true)
|-- geohash_8: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- neighbor: string (nullable = true)
| | |-- nlat: float (nullable = true)
| | |-- nlong: float (nullable = true)
+---+----+----+---------+------------------------------------------------------------------------------------+
|id |lat |long|geohash_8|neighbors |
+---+----+----+---------+------------------------------------------------------------------------------------+
|0 |10.0|11.0|A |[[D, C, 10.0], [D, A, 10.0], [D, B, 10.0], [C, A, 10.0], [C, B, 10.0], [A, B, 10.0]]|
|2 |14.0|15.0|C |[] |
|1 |12.0|13.0|B |[[D, B, 10.0]] |
|3 |16.0|17.0|D |[] |
+---+----+----+---------+------------------------------------------------------------------------------------+

Resources