Join two big tables and get the most recent value - apache-spark

I want to join two Spark dataframes that have millions of rows. Assume 'id' is the column common to both dataframes, and both also have a 'date' column. However, the dates in the two tables may not match. If a record in the first table has no matching date in the second table, the most recent earlier observation of the 'value' column from the second table should be taken instead. Therefore, I cannot simply join on 'id' and 'date'. I have created sample dataframes below. What is the optimal way to perform this join given that the data size is huge?
import pandas as pd
a = pd.DataFrame({'id':[1,2,3,1,2,3,1,2,3, 1,2,3], 'date': ['2020-01-01', '2020-01-01', '2020-01-01', '2020-01-08', '2020-01-08', '2020-01-08', '2020-01-21', '2020-01-21', '2020-01-21', '2020-01-31', '2020-01-31', '2020-01-31']})
a = spark.createDataFrame(a)
b = pd.DataFrame({'id':[1,2,3,1,2,1,3,1,2], 'date': ['2019-12-25', '2019-12-25', '2019-12-25', '2020-01-08', '2020-01-08', '2020-01-21', '2020-01-21', '2020-01-31', '2020-01-31'], 'value': [0.1,0.2,0.3,1,2,10,30,0.1,0.2]})
b = spark.createDataFrame(b)
required_result = pd.DataFrame({'id': [1,2,3,1,2,3,1,2,3,1,2,3],
                                'date': ['2020-01-01', '2020-01-01', '2020-01-01', '2020-01-08', '2020-01-08', '2020-01-08', '2020-01-21', '2020-01-21', '2020-01-21', '2020-01-31', '2020-01-31', '2020-01-31'],
                                'value': [0.1, 0.2, 0.3, 1, 2, 0.3, 10, 2, 30, 0.1, 0.2, 30]})

You could join on id and keep only the dates from the second dataframe that are equal to or earlier than the first dataframe's dates. The date difference is then always zero or negative, so keeping its maximum per (id, date) selects the most recent earlier observation.
from pyspark.sql import functions as func
from pyspark.sql.window import Window as wd

# data1_sdf and data2_sdf are the two Spark dataframes (a and b above)
data1_sdf.join(data2_sdf.withColumnRenamed('date', 'date_b'),
               [data1_sdf.id == data2_sdf.id,
                data1_sdf.date >= func.col('date_b')],
               'left'). \
    drop(data2_sdf.id). \
    withColumn('dates_diff', func.datediff('date_b', 'date')). \
    withColumn('max_dtdiff',
               func.max('dates_diff').over(wd.partitionBy('id', 'date'))). \
    filter(func.col('max_dtdiff') == func.col('dates_diff')). \
    drop('dates_diff', 'max_dtdiff'). \
    orderBy('id', 'date'). \
    show()
# +---+----------+----------+-----+
# | id| date| date_b|value|
# +---+----------+----------+-----+
# | 1|2020-01-01|2019-12-25| 0.1|
# | 1|2020-01-08|2020-01-08| 1.0|
# | 1|2020-01-21|2020-01-21| 10.0|
# | 1|2020-01-31|2020-01-31| 0.1|
# | 2|2020-01-01|2019-12-25| 0.2|
# | 2|2020-01-08|2020-01-08| 2.0|
# | 2|2020-01-21|2020-01-08| 2.0|
# | 2|2020-01-31|2020-01-31| 0.2|
# | 3|2020-01-01|2019-12-25| 0.3|
# | 3|2020-01-08|2019-12-25| 0.3|
# | 3|2020-01-21|2020-01-21| 30.0|
# | 3|2020-01-31|2020-01-21| 30.0|
# +---+----------+----------+-----+

It seems that you can join just on id, as this key looks well distributed. You could aggregate df b a bit, join the two dataframes, then filter and extract the value with the latest date.
from pyspark.sql import functions as F

# Collect b's (date, value) pairs per id, then pick the latest pair whose date
# is not after a's date (F.filter with a Python lambda requires Spark 3.1+).
b = b.groupBy('id').agg(F.collect_list(F.array('date', 'value')).alias('dv'))
df = a.join(b, 'id', 'left')
df = df.select(
    a['*'],
    F.array_max(F.filter('dv', lambda x: x[0] <= F.col('date')))[1].alias('value')
)
df.show()
# +---+----------+-----+
# | id| date|value|
# +---+----------+-----+
# | 1|2020-01-01| 0.1|
# | 1|2020-01-08| 1.0|
# | 3|2020-01-01| 0.3|
# | 3|2020-01-08| 0.3|
# | 2|2020-01-01| 0.2|
# | 2|2020-01-08| 2.0|
# | 1|2020-01-21| 10.0|
# | 1|2020-01-31| 0.1|
# | 3|2020-01-21| 30.0|
# | 3|2020-01-31| 30.0|
# | 2|2020-01-21| 2.0|
# | 2|2020-01-31| 0.2|
# +---+----------+-----+
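If the non-equi join or the per-id collected arrays get too heavy at this scale, a third option is to union the two frames and forward-fill the last known value per id in a single window pass. This is only a sketch, not part of the answers above; it uses the original a and b from the question (before b was aggregated) and assumes 'date' is an ISO yyyy-MM-dd string, so lexicographic order matches chronological order.
from pyspark.sql import functions as F, Window

# Tag the origin of each row: 0 = lookup table b, 1 = base table a.
b_src = b.select('id', 'date', 'value', F.lit(0).alias('src'))
a_src = a.select('id', 'date', F.lit(None).cast('double').alias('value'), F.lit(1).alias('src'))
unioned = b_src.unionByName(a_src)

# Within each id, order by date (b's rows first on ties) and carry the last
# non-null value forward.
w = (Window.partitionBy('id')
     .orderBy('date', 'src')
     .rowsBetween(Window.unboundedPreceding, Window.currentRow))
filled = unioned.withColumn('value', F.last('value', ignorenulls=True).over(w))

# Keep only the rows that came from a.
result = filled.filter(F.col('src') == 1).drop('src')
result.orderBy('id', 'date').show()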

Related

Create sequential unique id for each group

I'm trying to find an equivalent of the following snippet (reference) to create a unique id for every unique combination of two columns in PySpark.
Pandas approach:
df['my_id'] = df.groupby(['foo', 'bar'], sort=False).ngroup() + 1
I tried the following, but it's creating more ids than required:
df = df.withColumn("my_id", F.row_number().over(Window.orderBy('foo', 'bar')))
Instead of row_number, use dense_rank:
from pyspark.sql import functions as F, Window

df = spark.createDataFrame(
    [('r1', 'ph1'),
     ('r1', 'ph1'),
     ('r1', 'ph2'),
     ('s4', 'ph3'),
     ('s3', 'ph2'),
     ('s3', 'ph2')],
    ['foo', 'bar'])
df = df.withColumn("my_id", F.dense_rank().over(Window.orderBy('foo', 'bar')))
df.show()
# +---+---+-----+
# |foo|bar|my_id|
# +---+---+-----+
# | r1|ph1| 1|
# | r1|ph1| 1|
# | r1|ph2| 2|
# | s3|ph2| 3|
# | s3|ph2| 3|
# | s4|ph3| 4|
# +---+---+-----+
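Note that a Window with an orderBy but no partitionBy pulls every row into a single partition. If that becomes a bottleneck on large data, one variant (a sketch, not part of the answer above, assuming df is still the original two-column frame before my_id was added) is to rank only the distinct pairs, which is a much smaller frame, and join the ids back:
from pyspark.sql import functions as F, Window

# Rank the distinct (foo, bar) pairs only, then attach the id to every row.
pairs = (df.select('foo', 'bar').distinct()
           .withColumn('my_id', F.dense_rank().over(Window.orderBy('foo', 'bar'))))
df_with_id = df.join(pairs, ['foo', 'bar'], 'left')
df_with_id.show()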

How do I replace a full stop with a zero in PySpark?

I am trying to replace a full stop in my raw data with the value 0 in PySpark.
1. I tried to use a .when and .otherwise statement.
2. I tried to use regexp_replace to change the '.' to 0.
Code tried:
from pyspark.sql import functions as F
#For #1 above:
dataframe2 = dataframe1.withColumn("test_col", F.when(((F.col("test_col") == F.lit(".")), 0).otherwise(F.col("test_col")))
#For #2 above:
dataframe2 = dataframe1.withColumn('test_col', F.regexp_replace(dataframe1.test_col, '.', 0))
The column should end up with numbers only, i.e. rows that are not a full stop already contain a number, and rows that are a single full stop should be replaced with 0.
PySpark version:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StringType, IntegerType, StructField, StructType)
from pyspark.sql import functions

column_schema = StructType([StructField("num", IntegerType()), StructField("text", StringType())])
data = [[3, 'r1'], [9, 'r2.'], [27, '.']]

spark = SparkSession.builder.master("local").getOrCreate()
spark.conf.set("spark.executor.memory", '1g')
spark.conf.set('spark.executor.cores', '1')
spark.conf.set('spark.cores.max', '2')
spark.conf.set("spark.driver.memory", '1g')
spark_context = spark.sparkContext

data_frame = spark.createDataFrame(data, schema=column_schema)
data_frame.show()

filtered_data_frame = data_frame.withColumn(
    'num', functions.when(data_frame['num'] == 3, -3).otherwise(data_frame['num']))
filtered_data_frame.show()

filtered_data_frame = data_frame.withColumn(
    'text', functions.when(data_frame['text'] == '.', '0').otherwise(data_frame['text']))
filtered_data_frame.show()
output
+---+----+
|num|text|
+---+----+
| 3| r1|
| 9| r2.|
| 27| .|
+---+----+
+---+----+
|num|text|
+---+----+
| -3| r1|
| 9| r2.|
| 27| .|
+---+----+
+---+----+
|num|text|
+---+----+
| 3| r1|
| 9| r2.|
| 27| 0|
+---+----+
A Scala sample that does the same kind of replacement properly:
package otz.scalaspark

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object ValueReplacement {
  def main(args: Array[String]) {
    val sparkConfig = new SparkConf().setAppName("Value-Replacement").setMaster("local[*]").set("spark.executor.memory", "1g")
    val sparkContext = new SparkContext(sparkConfig)

    val someData = Seq(
      Row(3, "r1"),
      Row(9, "r2"),
      Row(27, "r3"),
      Row(81, "r4")
    )
    val someSchema = List(
      StructField("number", IntegerType, true),
      StructField("word", StringType, true)
    )

    val sqlContext = new SQLContext(sparkContext)
    val dataFrame = sqlContext.createDataFrame(
      sparkContext.parallelize(someData),
      StructType(someSchema)
    )

    val filteredDataFrame = dataFrame.withColumn("number", when(col("number") === 3, -3).otherwise(col("number")))
    filteredDataFrame.show()
  }
}
output
+------+----+
|number|word|
+------+----+
| -3| r1|
| 9| r2|
| 27| r3|
| 81| r4|
+------+----+
Your attempt #2 was almost correct. If you have a dataframe1 like:
+--------+
|test_col|
+--------+
| 1.0|
| 2.0|
| 2|
+--------+
then your attempt yields:
dataframe2 = dataframe1.withColumn('test_col', F.regexp_replace(dataframe1.test_col, '.', 0))
dataframe2.show()
+--------+
|test_col|
+--------+
| 000|
| 000|
| 0|
+--------+
Here the unescaped . matches any character, so every character is replaced, not just the '.'. However, if you add an escape (\) before the dot, things work as expected:
dataframe2 = dataframe1.withColumn('test_col', F.regexp_replace(dataframe1.test_col, '\.', '0'))
dataframe2.show()
+--------+
|test_col|
+--------+
| 100|
| 200|
| 2|
+--------+
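If the intent in the question is to replace only the cells that consist of a single full stop (leaving decimal points such as 1.0 untouched), anchoring the pattern does that; a sketch equivalent to the when/otherwise approach shown in the other answer:
# Only rows that are exactly "." are rewritten; "1.0" stays "1.0".
dataframe2 = dataframe1.withColumn('test_col', F.regexp_replace('test_col', r'^\.$', '0'))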

How to convert a group and agg code on a dataframe to a udf in PySpark?

I am trying to create a udf to generalize my code, but I run into an issue where it seems I cannot pass a dataframe to the function.
Input DataFrame:
df = sqlContext.createDataFrame([('1', 201001,3400,1600,65,320,400,), ('1', 201002,5200,1600,65,320,400,), ('1', 201003,65,1550,32,320,400,), ('2', 201505,3200,1800,12,1,40,), ('2', 201508,3200,3200,12,1,40,), ('3', 201412,40,40,12,1,3,)],
['ColA', 'Col1','Col2','Col3','Col4','Col5','Col6',])
+----+------+----+----+----+----+----+
|ColA| Col1|Col2|Col3|Col4|Col5|Col6|
+----+------+----+----+----+----+----+
| 1|201001|3400|1600| 65| 320| 400|
| 1|201002|5200|1600| 65| 320| 400|
| 1|201003| 65|1550| 32| 320| 400|
| 2|201505|3200|1800| 12| 1| 40|
| 2|201508|3200|3200| 12| 1| 40|
| 3|201412| 40| 40| 12| 1| 3|
+----+------+----+----+----+----+----+
Expected Output:
df = sqlContext.createDataFrame([(1,['201001', '201002', '201003'],[3400, 5200, 65],[1600, 1600, 1550],[65,32],[320],[400],), (2,['201505', '201508'],[3200, 3200],[1800, 3200],[12],[1],[40],),
(3,['201412'],[40],[40],[12],[1],[3],)], ['ColA', 'Col1','Col2','Col3','Col4','Col5','Col6',])
df.show()
+----+--------------------+----------------+------------------+--------+-----+-----+
|ColA| Col1| Col2| Col3| Col4| Col5| Col6|
+----+--------------------+----------------+------------------+--------+-----+-----+
| 1|[201001, 201002, ...|[3400, 5200, 65]|[1600, 1600, 1550]|[65, 32]|[320]|[400]|
| 2| [201505, 201508]| [3200, 3200]| [1800, 3200]| [12]| [1]| [40]|
| 3| [201412]| [40]| [40]| [12]| [1]| [3]|
+----+--------------------+----------------+------------------+--------+-----+-----+
This is the code that works (but is not wrapped in a function):
groupBy = ['ColA']
cols_to_list = ['Col1', 'Col2', 'Col3']
cols_to_set = ['Col4', 'Col5', 'Col6']
exprs = [F.collect_list(F.col(c)).alias(c) for c in cols_to_list] \
    + [F.collect_set(F.col(c)).alias(c) for c in cols_to_set]
df = df.groupby(*groupBy).agg(*exprs)
When I try to create a udf, I get this error:
@F.udf
def aggregation(df, groupby_column, cols_to_list, cols_to_set):
    exprs = [F.collect_list(F.col(c)).alias(c) for c in cols_to_list] \
        + [F.collect_set(F.col(c)).alias(c) for c in cols_to_set]
    return df.groupby(*groupby_column).agg(*exprs)
groupby_column = ['ColA']
cols_to_list = ['Col1', 'Col2', 'Col3',]
cols_to_set = ['Col4', 'Col5', 'Col6',]
exprs = F.concat([f(F.col(c)) for f in fun_list for c in convert_to_list], [f(F.col(c)) for f in funs_set for c in convert_to_set])
df = df.groupby(*groupBy).agg(*exprs)
TypeError: Invalid argument, not a string or column: DataFrame
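The error happens because a udf receives column values row by row, so a DataFrame (or a groupBy/agg on one) cannot be wrapped in F.udf. A plain Python helper function with no decorator is enough to generalize the working code; a sketch assuming collect_list for the list columns and collect_set for the set columns, as in the expected output:
from pyspark.sql import functions as F

def aggregation(df, groupby_column, cols_to_list, cols_to_set):
    # Build the aggregation expressions and run them directly; no udf is involved,
    # so passing the DataFrame as an argument is fine.
    exprs = [F.collect_list(F.col(c)).alias(c) for c in cols_to_list] \
        + [F.collect_set(F.col(c)).alias(c) for c in cols_to_set]
    return df.groupby(*groupby_column).agg(*exprs)

result = aggregation(df, ['ColA'], ['Col1', 'Col2', 'Col3'], ['Col4', 'Col5', 'Col6'])
result.show()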

Spark: Find the value with the highest occurrence per group over rolling time window

Starting from the following spark data frame:
from io import StringIO
import pandas as pd
from pyspark.sql.functions import col
pd_df = pd.read_csv(StringIO("""device_id,read_date,id,count
device_A,2017-08-05,4041,3
device_A,2017-08-06,4041,3
device_A,2017-08-07,4041,4
device_A,2017-08-08,4041,3
device_A,2017-08-09,4041,3
device_A,2017-08-10,4041,1
device_A,2017-08-10,4045,2
device_A,2017-08-11,4045,3
device_A,2017-08-12,4045,3
device_A,2017-08-13,4045,3"""),infer_datetime_format=True, parse_dates=['read_date'])
df = spark.createDataFrame(pd_df).withColumn('read_date', col('read_date').cast('date'))
df.show()
Output:
+--------------+----------+----+-----+
|device_id | read_date| id|count|
+--------------+----------+----+-----+
| device_A|2017-08-05|4041| 3|
| device_A|2017-08-06|4041| 3|
| device_A|2017-08-07|4041| 4|
| device_A|2017-08-08|4041| 3|
| device_A|2017-08-09|4041| 3|
| device_A|2017-08-10|4041| 1|
| device_A|2017-08-10|4045| 2|
| device_A|2017-08-11|4045| 3|
| device_A|2017-08-12|4045| 3|
| device_A|2017-08-13|4045| 3|
+--------------+----------+----+-----+
I would like to find the most frequent id for each (device_id, read_date) combination, over a 3 day rolling window. For each group of rows selected by the time window, I need to find the most frequent id by summing up the counts per id, then return the top id.
Expected Output:
+--------------+----------+----+
|device_id | read_date| id|
+--------------+----------+----+
| device_A|2017-08-05|4041|
| device_A|2017-08-06|4041|
| device_A|2017-08-07|4041|
| device_A|2017-08-08|4041|
| device_A|2017-08-09|4041|
| device_A|2017-08-10|4041|
| device_A|2017-08-11|4045|
| device_A|2017-08-12|4045|
| device_A|2017-08-13|4045|
+--------------+----------+----+
I am starting to think this is only possible using a custom aggregation function. Since Spark 2.3 is not out yet, I will have to write this in Scala or use collect_list. Am I missing something?
Add window:
from pyspark.sql.functions import window, sum as sum_, date_add, col

df_w = df.withColumn(
    "read_date", window("read_date", "3 days", "1 day")["start"].cast("date")
)
# Then handle the counts
df_w = df_w.groupBy('device_id', 'read_date', 'id').agg(sum_('count').alias('count'))
Then use one of the solutions from Find maximum row per group in Spark DataFrame, for example:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

rolling_window = 3

top_df = (
    df_w
    .withColumn(
        "rn",
        row_number().over(
            Window.partitionBy("device_id", "read_date")
            .orderBy(col("count").desc())
        )
    )
    .where(col("rn") == 1)
    .orderBy("read_date")
    .drop("rn")
)

# results are calculated on the start of the time window - adjust read_date as needed
final_df = top_df.withColumn('read_date', date_add('read_date', rolling_window - 1))
final_df.show()
# +---------+----------+----+-----+
# |device_id| read_date| id|count|
# +---------+----------+----+-----+
# | device_A|2017-08-05|4041| 3|
# | device_A|2017-08-06|4041| 6|
# | device_A|2017-08-07|4041| 10|
# | device_A|2017-08-08|4041| 10|
# | device_A|2017-08-09|4041| 10|
# | device_A|2017-08-10|4041| 7|
# | device_A|2017-08-11|4045| 5|
# | device_A|2017-08-12|4045| 8|
# | device_A|2017-08-13|4045| 9|
# | device_A|2017-08-14|4045| 6|
# | device_A|2017-08-15|4045| 3|
# +---------+----------+----+-----+
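Note that the date shift pushes some rows past the last read_date in the input (2017-08-14 and 2017-08-15 above). If only dates present in the source are wanted, a semi join back to the original pairs trims them; a sketch, not part of the original answer:
# Keep only (device_id, read_date) pairs that exist in the input data.
final_df = final_df.join(df.select('device_id', 'read_date').distinct(),
                         ['device_id', 'read_date'], 'leftsemi')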
I managed to find a very inefficient solution. Hopefully someone can spot improvements to avoid the python udf and call to collect_list.
from collections import Counter

from pyspark.sql import Window
from pyspark.sql.functions import col, collect_list, first, udf
from pyspark.sql.types import IntegerType

def top_id(ids, counts):
    c = Counter()
    for cnid, count in zip(ids, counts):
        c[cnid] += count
    return c.most_common(1)[0][0]

rolling_window = 3
days = lambda i: i * 86400

# Define a rolling calculation window based on time
window = (
    Window()
    .partitionBy("device_id")
    .orderBy(col("read_date").cast("timestamp").cast("long"))
    .rangeBetween(-days(rolling_window - 1), 0)
)

# Use window and collect_list to store data matching the window definition on each row
df_collected = df.select(
    'device_id', 'read_date',
    collect_list(col('id')).over(window).alias('ids'),
    collect_list(col('count')).over(window).alias('counts')
)

# Get rid of duplicate rows where necessary
df_grouped = df_collected.groupBy('device_id', 'read_date').agg(
    first('ids').alias('ids'),
    first('counts').alias('counts'),
)

# Register and apply udf to return the most frequently seen id
top_id_udf = udf(top_id, IntegerType())
df_mapped = df_grouped.withColumn('top_id', top_id_udf(col('ids'), col('counts')))
df_mapped.show(truncate=False)
returns:
+---------+----------+------------------------+------------+------+
|device_id|read_date |ids |counts |top_id|
+---------+----------+------------------------+------------+------+
|device_A |2017-08-05|[4041] |[3] |4041 |
|device_A |2017-08-06|[4041, 4041] |[3, 3] |4041 |
|device_A |2017-08-07|[4041, 4041, 4041] |[3, 3, 4] |4041 |
|device_A |2017-08-08|[4041, 4041, 4041] |[3, 4, 3] |4041 |
|device_A |2017-08-09|[4041, 4041, 4041] |[4, 3, 3] |4041 |
|device_A |2017-08-10|[4041, 4041, 4041, 4045]|[3, 3, 1, 2]|4041 |
|device_A |2017-08-11|[4041, 4041, 4045, 4045]|[3, 1, 2, 3]|4045 |
|device_A |2017-08-12|[4041, 4045, 4045, 4045]|[1, 2, 3, 3]|4045 |
|device_A |2017-08-13|[4045, 4045, 4045] |[3, 3, 3] |4045 |
+---------+----------+------------------------+------------+------+
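On Spark 2.4+ the Python udf can be avoided by exploding the collected arrays and re-aggregating. A sketch that reuses df_grouped from above (arrays_zip and the intermediate names are the only additions):
from pyspark.sql import functions as F, Window

# Explode the per-window (id, count) pairs back into rows.
exploded = (df_grouped
            .withColumn('pair', F.explode(F.arrays_zip('ids', 'counts')))
            .select('device_id', 'read_date',
                    F.col('pair.ids').alias('id'),
                    F.col('pair.counts').alias('count')))
# Sum the counts per id inside each rolling window.
summed = exploded.groupBy('device_id', 'read_date', 'id').agg(F.sum('count').alias('count'))
# Keep the id with the largest rolling sum per (device_id, read_date).
w = Window.partitionBy('device_id', 'read_date').orderBy(F.col('count').desc())
top = (summed.withColumn('rn', F.row_number().over(w))
             .where(F.col('rn') == 1)
             .select('device_id', 'read_date', 'id'))
top.orderBy('read_date').show()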

Convert Python dictionary to Spark DataFrame

I have a Python dictionary:
dic = {
(u'aaa',u'bbb',u'ccc'):((0.3, 1.2, 1.3, 1.5), 1.4, 1),
(u'kkk',u'ggg',u'ccc',u'sss'):((0.6, 1.2, 1.7, 1.5), 1.4, 2)
}
I'd like to convert this dictionary to a Spark DataFrame with the columns:
['key', 'val_1', 'val_2', 'val_3', 'val_4', 'val_5', 'val_6']
Example row (1):
key                  | val_1 | val_2 | val_3 | val_4 | val_5 | val_6
u'aaa',u'bbb',u'ccc' | 0.3   | 1.2   | 1.3   | 1.5   | 1.4   | 1
Thank you in advance
Extract items, cast key to list and combine everything into a single tuple:
df = sc.parallelize([
    (list(k), ) + v[0] + v[1:]
    for k, v in dic.items()
]).toDF(['key', 'val_1', 'val_2', 'val_3', 'val_4', 'val_5', 'val_6'])
df.show()
## +--------------------+-----+-----+-----+-----+-----+-----+
## | key|val_1|val_2|val_3|val_4|val_5|val_6|
## +--------------------+-----+-----+-----+-----+-----+-----+
## | [aaa, bbb, ccc]| 0.3| 1.2| 1.3| 1.5| 1.4| 1|
## |[kkk, ggg, ccc, sss]| 0.6| 1.2| 1.7| 1.5| 1.4| 2|
## +--------------------+-----+-----+-----+-----+-----+-----+
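On newer Spark versions the same flattening can go straight through spark.createDataFrame without an explicit RDD; a sketch assuming a SparkSession named spark is available:
df = spark.createDataFrame(
    [(list(k), ) + v[0] + v[1:] for k, v in dic.items()],
    ['key', 'val_1', 'val_2', 'val_3', 'val_4', 'val_5', 'val_6']
)
df.show()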
