PySpark window with condition - apache-spark

I have a dataset with application logs that show when a certain app was launched or closed. Sometimes, the related events may be missing entirely from the logs. I want to match each app start with the related end event (if it exists).
Here's an illustrative dataset:
import pyspark.sql.functions as F
from pyspark.sql import Window
df = spark.createDataFrame([['Group1', 'Logon', 'Name1', '2021-02-05T19:03:00.000+0000'],
['Group1', 'Start', 'Name1', '2021-02-05T19:04:00.000+0000'],
['Group1', 'Start', 'Name1', '2021-02-05T19:05:00.000+0000'],
['Group1', 'End', 'Name1', '2021-02-05T19:06:00.000+0000'],
['Group1', 'End', 'Name3', '2021-02-05T19:06:01.000+0000'],
['Group1', 'End', 'Name1', '2021-02-05T19:07:00.000+0000'],
['Group2', 'Start', 'Name1', '2021-02-05T19:04:00.000+0000'],
['Group2', 'Start', 'Name1', '2021-02-05T19:05:00.000+0000'],
['Group2', 'Start', 'Name2', '2021-02-05T19:06:00.000+0000'],
['Group2', 'End', 'Name1', '2021-02-05T19:07:00.000+0000'],
['Group2', 'Close', 'Name1', '2021-02-05T19:07:00.000+0000'],
], ['group', 'type', 'name', 'time'])
df = df.withColumn('time', F.col('time').cast('timestamp'))
For each group separately, I want to put a common identifier to each 'Start' and 'End' event if they have the same 'name'. In other words, for each 'Start' event I want to find the first 'End' event that has not already been matched to another 'Start' event.
The expected result could be something like the following picture:
I don't mind if the identifier (i.e. 'my_group') is an ID, a timestamp or if it is monotonically increasing across groups. I just want to be able to match the relevant events within each group.
What I've tried
I thought about using window functions in order to identify the end time of 'Start' events and the start time of 'End' events. However, I cannot restrict to searching only for 'End' events (and 'Start' events respectively). Also, I cannot apply the logic described above of finding the first 'End' event that has not already been matched to another 'Start' event.
Here's my code:
app_session_window_down = Window.partitionBy('group', "name").orderBy(F.col("time").cast('long')).rangeBetween(1, Window.unboundedFollowing) #search in the future
app_session_window_up = Window.partitionBy('group', "name").orderBy(F.col("time").cast('long')).rangeBetween(Window.unboundedPreceding, -1) #search in the past
df = df.withColumn("app_time_end", F.when((F.col("type") == 'Start'), F.first(F.col('time'), ignorenulls=True).over(app_session_window_down)).otherwise(F.lit('None')))\
.withColumn("app_time_start", F.when((F.col("type") == 'End'), F.last(F.col('time'), ignorenulls=True).over(app_session_window_up)).otherwise(F.col('app_time_end')))
which gives:
This is nowhere close to what I want to achieve. Any hints?

Explanations are in the inline comments:
from pyspark.sql import functions as F, Window
df2 = df.withColumn(
'my_group', # the column you wanted
F.when(
F.col('type').isin(['Start', 'End']),
F.row_number().over(Window.partitionBy('group', 'name', 'type').orderBy('time'))
)
).withColumn(
'max_group', # helper column: get maximum row_number for each group ; will be used later
F.least(
F.max(
F.when(
F.col('type') == 'Start', F.col('my_group')
).otherwise(0)
).over(Window.partitionBy('group', 'name')),
F.max(
F.when(
F.col('type') == 'End', F.col('my_group')
).otherwise(0)
).over(Window.partitionBy('group', 'name'))
)
).withColumn(
'my_group', # mask the rows which don't have corresponding 'start'/'end'
F.when(
F.col('my_group') <= F.col('max_group'),
F.col('my_group')
)
).withColumn(
'my_group', # add the group name
F.when(F.col('my_group').isNotNull(), F.concat_ws('_', 'group', 'name', 'my_group'))
).drop('max_group').orderBy('group', 'time')
df2.show()
+------+-----+-----+-------------------+--------------+
| group| type| name| time| my_group|
+------+-----+-----+-------------------+--------------+
|Group1|Logon|Name1|2021-02-05 19:03:00| null|
|Group1|Start|Name1|2021-02-05 19:04:00|Group1_Name1_1|
|Group1|Start|Name1|2021-02-05 19:05:00|Group1_Name1_2|
|Group1| End|Name1|2021-02-05 19:06:00|Group1_Name1_1|
|Group1| End|Name3|2021-02-05 19:06:01| null|
|Group1| End|Name1|2021-02-05 19:07:00|Group1_Name1_2|
|Group2|Start|Name1|2021-02-05 19:04:00|Group2_Name1_1|
|Group2|Start|Name1|2021-02-05 19:05:00| null|
|Group2|Start|Name2|2021-02-05 19:06:00| null|
|Group2| End|Name1|2021-02-05 19:07:00|Group2_Name1_1|
|Group2|Close|Name1|2021-02-05 19:07:00| null|
+------+-----+-----+-------------------+--------------+

Related

avoid repeating column condition

Let's assume I have the following df:
students = spark.createDataFrame(
[
("amit",),
("amit",),
("itay",),
],
["student"],
)
I want to create a lot of columns based on the value in student column.
I know for sure that I might have just 2 values on this data frame.
example:
students = students.withColumn(
"address", f.when(f.col("student") == "amit", f.lit("berlin")).otherwise(f.lit("paris"))
).withColumn(
"studies", f.when(f.col("student") == "amit", f.lit("CS")).otherwise(f.lit("physics"))
).withColumn(
"age", f.when(f.col("student") == "amit", f.lit("25")).otherwise(f.lit("27"))
)
Can I do this more cleanly, without repeating f.when(f.col("student") == "amit") every time, or create these columns together? Any suggestions are welcome.
You could create a list of 3-tuples with all the information that's necessary to create your columns:
values = [
("address", "berlin", "paris"),
("studies", "CS", "physics"),
("age", "25", "27")
]
Then, you can create your spark columns by iterating over values:
cols = [
f.when(f.col('student') == "amit", f.lit(val1))
.otherwise(f.lit(val2)).alias(col_name)
for (col_name, val1, val2) in values
]
students.select("*", *cols).show()
+-------+-------+-------+---+
|student|address|studies|age|
+-------+-------+-------+---+
| amit| berlin| CS| 25|
| amit| berlin| CS| 25|
| itay| paris|physics| 27|
+-------+-------+-------+---+

How to make a function with dynamic variables to add rows in pandas?

I'm trying to make a table from a list of data using pandas.
Originally I wanted to make a function where I can pass dynamic variables so I could continuously add new rows from data list.
It works up to the point where the rows are added. The column headers are added, but the data is not: either only the last column gets a value, or nothing is added at all.
My scratch attempt was:
for title in titles:
for x in data:
table = {
title: data[x]
}
df.DataFrame(table, columns=titles, index[0]
columns list:
titles = ['timestamp', 'source', 'tracepoint']
data list:
data = ['first', 'second', 'third',
'first', 'second', 'third',
'first', 'second', 'third']
How can I make something like this?
timestamp, source, tracepoint
first, second, third
first, second, third
first, second, third
If you just want to initialize pandas dataframe, you can use dataframe’s constructor.
And you can also append row by using a dict.
Pandas provides other useful functions,
such as concatenation between data frames, insert/delete columns. If you need, please check pandas’s doc.
import pandas as pd
# Build the table in one go with the DataFrame constructor: one inner list
# per row, with `titles` supplying the column headers.
titles = ['timestamp', 'source', 'tracepoint']
data = [['first', 'second', 'third'],
        ['first', 'second', 'third'],
        ['first', 'second', 'third']]
df = pd.DataFrame(data, columns=titles)
print('---initialization---')
print(df)
# Append a row described by a dict (column name -> value).
new_row = {
    'timestamp': '2020/11/01',
    'source': 'xxx',
    'tracepoint': 'yyy'
}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so wrap
# the dict in a one-row frame and concatenate; ignore_index renumbers the
# rows 0..n-1 exactly as append(..., ignore_index=True) did.
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
print('---append result---')
print(df)
output
---initialization---
timestamp source tracepoint
0 first second third
1 first second third
2 first second third
---append result---
timestamp source tracepoint
0 first second third
1 first second third
2 first second third
3 2020/11/01 xxx yyy

How to parse and explode a list of dictionaries stored as string in pyspark?

I have some data that is stored in CSV. Sample data is available here - https://github.com/PranayMehta/apache-spark/blob/master/data.csv
I read the data using pyspark
df = spark.read.csv("data.csv",header=True)
df.printSchema()
root
|-- freeform_text: string (nullable = true)
|-- entity_object: string (nullable = true)
>>> df.show(truncate=False)

|freeform_text |entity_object |

|Grapes are good. Bananas are bad.|[{'name': 'Grapes', 'type': 'OTHER', 'salience': '0.8335162997245789', 'sentiment_score': '0.8999999761581421', 'sentiment_magnitude': '0.8999999761581421', 'metadata': {}, 'mentions': {'mention_text': 'Grapes', 'mention_type': 'COMMON'}}, {'name': 'Bananas', 'type': 'OTHER', 'salience': '0.16648370027542114', 'sentiment_score': '-0.8999999761581421', 'sentiment_magnitude': '0.8999999761581421', 'metadata': {}, 'mentions': {'mention_text': 'Bananas', 'mention_type': 'COMMON'}}]|
|the weather is not good today |[{'name': 'weather', 'type': 'OTHER', 'salience': '1.0', 'sentiment_score': '-0.800000011920929', 'sentiment_magnitude': '0.800000011920929', 'metadata': {}, 'mentions': {'mention_text': 'weather', 'mention_type': 'COMMON'}}] |

Now, I want to explode and parse the fields in the entity_object column in this dataframe. Here is some more know-how on what this column contains -
For every freeform_text stored in the Spark DataFrame, I have written some logic to parse out the entities using Google's Natural Language API. These entities are stored as a LIST of DICTIONARIES when I do the computation using pandas. I then convert them to strings before storing them in the database.
This CSV is what I read in spark dataframe as 2 columns - freeform_text and entity_object.
The entity_object column as string is actually a LIST of dictionaries. It can be imagined as LIST[ DICT1, DICT2 ] and so on. So, some entity_object rows may have 1 element others may have more than 1 based on the number of entities in the output. For instance in the first row, there are 2 entities - grapes and bananas, whereas in 2nd row there is only entity weather.
I want to explode this entity_object column so that 1 record of freeform_text can be exploded in multiple records.
Here is a screenshot of how I would like my output to be -
This can be a working solution for you - please do let me know if this does not work -
Create the Dataframe here
df_new=spark.createDataFrame([
{
str({'name':'Grapes','type':'OTHER','salience':'0.8335162997245789','sentiment_score':'0.8999999761581421','sentiment_magnitude':'0.8999999761581421','metadata':{},'mentions':{'mention_text':'Grapes','mention_type':'COMMON'}}),
str(
{'name':'weather','type':'OTHER','salience':'1.0','sentiment_score':'-0.800000011920929','sentiment_magnitude':'0.800000011920929','metadata':{},'mentions':{'mention_text':'weather','mention_type':'COMMON'}}
)
},
{
str(
{'name':'banana','type':'OTHER','salience':'1.0','sentiment_score':'-0.800000011920929','sentiment_magnitude':'0.800000011920929','metadata':{},'mentions':{'mention_text':'weather','mention_type':'COMMON'}}
)
}
],T.StringType())
Logic Here
df = df_new.withColumn('col', F.from_json("value", T.ArrayType(T.StringType())))
df = df.withColumn('explode_col', F.explode("col"))
df = df.withColumn('col', F.from_json("explode_col", T.MapType(T.StringType(), T.StringType())))
df = df.withColumn("name", df.col.getItem("name")).withColumn("type", df.col.getItem("type")).withColumn("salience", df.col.getItem("salience")).withColumn("sentiment_score", df.col.getItem("sentiment_score")).withColumn("sentiment_magnitude", df.col.getItem("sentiment_magnitude")).withColumn("mentions", df.col.getItem("mentions"))
df.select("name", "type","salience","sentiment_score","sentiment_magnitude","mentions").show(truncate=False)
Output
+-------+-----+------------------+------------------+-------------------+--------------------------------------------------+
|name |type |salience |sentiment_score |sentiment_magnitude|mentions |
+-------+-----+------------------+------------------+-------------------+--------------------------------------------------+
|weather|OTHER|1.0 |-0.800000011920929|0.800000011920929 |{"mention_text":"weather","mention_type":"COMMON"}|
|Grapes |OTHER|0.8335162997245789|0.8999999761581421|0.8999999761581421 |{"mention_text":"Grapes","mention_type":"COMMON"} |
|banana |OTHER|1.0 |-0.800000011920929|0.800000011920929 |{"mention_text":"weather","mention_type":"COMMON"}|
+-------+-----+------------------+------------------+-------------------+--------------------------------------------------+
Update - Instead of createDataFrame - use spark.read.csv() as below
df_new = spark.read.csv("/FileStore/tables/data.csv", header=True)
df_new.show(truncate=False)
# Logic Here
df = df_new.withColumn('col', F.from_json("entity_object", T.ArrayType(T.StringType())))
df = df.withColumn('explode_col', F.explode("col"))
df = df.withColumn('col', F.from_json("explode_col", T.MapType(T.StringType(), T.StringType())))
df = df.withColumn("name", df.col.getItem("name")).withColumn("type", df.col.getItem("type")).withColumn("salience", df.col.getItem("salience")).withColumn("sentiment_score", df.col.getItem("sentiment_score")).withColumn("sentiment_magnitude", df.col.getItem("sentiment_magnitude")).withColumn("mentions", df.col.getItem("mentions"))
df.select("freeform_text", "name", "type","salience","sentiment_score","sentiment_magnitude","mentions").show(truncate=False)

+---------------------------------+-------+-----+-------------------+-------------------+-------------------+--------------------------------------------------+
|freeform_text |name |type |salience |sentiment_score |sentiment_magnitude|mentions |
+---------------------------------+-------+-----+-------------------+-------------------+-------------------+--------------------------------------------------+
|Grapes are good. Bananas are bad.|Grapes |OTHER|0.8335162997245789 |0.8999999761581421 |0.8999999761581421 |{"mention_text":"Grapes","mention_type":"COMMON"} |
|Grapes are good. Bananas are bad.|Bananas|OTHER|0.16648370027542114|-0.8999999761581421|0.8999999761581421 |{"mention_text":"Bananas","mention_type":"COMMON"}|
|the weather is not good today |weather|OTHER|1.0 |-0.800000011920929 |0.800000011920929 |{"mention_text":"weather","mention_type":"COMMON"}|
+---------------------------------+-------+-----+-------------------+-------------------+-------------------+--------------------------------------------------+

How to Left Join in Presto SQL?

Can't for the life of me figure out a simple left join in Presto, even after reading the documentation. I'm very familiar with Postgres and tested my query there to make sure there wasn't a glaring error on my part. Please reference code below:
select * from
(select cast(order_date as date),
count(distinct(source_order_id)) as prim_orders,
sum(quantity) as prim_tickets,
sum(sale_amount) as prim_revenue
from table_a
where order_date >= date '2018-01-01'
group by 1)
left join
(select summary_date,
sum(impressions) as sem_impressions,
sum(clicks) as sem_clicks,
sum(spend) as sem_spend,
sum(total_orders) as sem_orders,
sum(total_tickets) as sem_tickets,
sum(total_revenue) as sem_revenue
from table_b
where site like '%SEM%'
and summary_date >= date '2018-01-01'
group by 1) as b
on a.order_date = b.summary_date
Running that gives the following error
SQL Error: Failed to run query
Failed to run query
line 1:1: mismatched input 'on' expecting {'(', 'SELECT', 'DESC', 'WITH',
'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'GRANT',
'REVOKE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'CALL', 'PREPARE', 'DEALLOCATE', 'EXECUTE'} (Service: AmazonAthena; Status Code: 400; Error Code: InvalidRequestException; Request ID: a33a6671-07a2-4d7b-bb75-f70f7b82409e)
line 1:1: mismatched input 'on' expecting {'(', 'SELECT', 'DESC', 'WITH', 'VALUES', 'CREATE', 'TABLE', 'INSERT', 'DELETE', 'DESCRIBE', 'GRANT', 'REVOKE', 'EXPLAIN', 'SHOW', 'USE', 'DROP', 'ALTER', 'SET', 'RESET', 'START', 'COMMIT', 'ROLLBACK', 'CALL', 'PREPARE', 'DEALLOCATE', 'EXECUTE'} (Service: AmazonAthena; Status Code: 400; Error Code: InvalidRequestException; Request ID: a33a6671-07a2-4d7b-bb75-f70f7b82409e)
The first problem I notice is that your join clause assumes the first sub-query is aliased as a, but it is not aliased at all. I recommend aliasing that table to see if that fixes it (I also recommend aliasing the order_date column explicitly outside of the cast() statement since you are joining on that column).
Try this:
select * from
(select cast(order_date as date) as order_date,
count(distinct(source_order_id)) as prim_orders,
sum(quantity) as prim_tickets,
sum(sale_amount) as prim_revenue
from table_a
where order_date >= date '2018-01-01'
group by 1) as a
left join
(select summary_date,
sum(impressions) as sem_impressions,
sum(clicks) as sem_clicks,
sum(spend) as sem_spend,
sum(total_orders) as sem_orders,
sum(total_tickets) as sem_tickets,
sum(total_revenue) as sem_revenue
from table_b
where site like '%SEM%'
and summary_date >= date '2018-01-01'
group by 1) as b
on a.order_date = b.summary_date
One option is to declare your subqueries by using with:
-- Declare both aggregations as CTEs, then left-join them on the date.
with a as
(select cast(order_date as date) as order_date,  -- alias is required: the join below references a.order_date
count(distinct(source_order_id)) as prim_orders,
sum(quantity) as prim_tickets,
sum(sale_amount) as prim_revenue
from table_a
where order_date >= date '2018-01-01'
group by 1),
b as
(select summary_date,
sum(impressions) as sem_impressions,
sum(clicks) as sem_clicks,
sum(spend) as sem_spend,
sum(total_orders) as sem_orders,
sum(total_tickets) as sem_tickets,
sum(total_revenue) as sem_revenue
from table_b
where site like '%SEM%'
and summary_date >= date '2018-01-01'
group by 1)
-- Keep every order date from a; SEM columns are null where b has no row.
select * from a
left join b
on a.order_date = b.summary_date;

Pyspark DataFrame: find difference between two DataFrames (values and column names)

I have 100+ columns in total in my DataFrame.
I am trying to compare two DataFrames and find the unmatched records along with the column name.
I get the expected output with the code below, but when I run it for 100+ columns the job gets aborted.
I am doing this to find errors in an SCD Type 2 delta process.
from pyspark.sql.types import *
from pyspark.sql.functions import *
d2 = sc.parallelize([("A1", 500,1005) ,("A2", 700,10007)])
dataFrame1 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
d2 = sc.parallelize([("A1", 600,1005),("A2", 700,10007)])
dataFrame2 = sqlContext.createDataFrame(d2, ["ID", "VALUE1", "VALUE2"])
key_id_col_name="ID"
key_id_value="A1"
dataFrame1.select("ID","VALUE1").subtract(dataFrame2.select("ID",col("VALUE1").alias("value"))).show()
def unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value):
    """Return the values of one keyed row that differ between two DataFrames.

    Both frames are first restricted to the rows whose ``key_id_col_name``
    equals ``key_id_value``. Then, for every other column present in both
    frames, the (key, value) pairs found in ``dataFrame1`` but not in
    ``dataFrame2`` are collected and tagged with the column's name.

    Args:
        dataFrame1: frame whose values are reported when they differ.
        dataFrame2: frame to compare against.
        key_id_col_name: name of the key column present in both frames.
        key_id_value: key value selecting the row(s) to compare.

    Returns:
        A DataFrame with the columns ``KEY_ID``, ``VALUE`` and ``COL_NAME``.

    Raises:
        ValueError: if the frames share no column other than the key, so
            there is nothing to compare.
    """
    left = dataFrame1.where(dataFrame1[key_id_col_name] == key_id_value)
    right = dataFrame2.where(dataFrame2[key_id_col_name] == key_id_value)

    # Walk dataFrame1's columns in order (instead of iterating a set) so the
    # sequence of per-column comparisons is the same on every run.
    right_columns = set(right.columns)
    col_names = [
        name for name in left.columns
        if name in right_columns and name != key_id_col_name
    ]
    if not col_names:
        raise ValueError(
            "no columns other than %r are shared by both DataFrames"
            % (key_id_col_name,)
        )

    def _key_and_value(frame, col_name):
        # Project a frame down to the (KEY_ID, VALUE) pair for one column.
        return frame.select(
            col(key_id_col_name).alias("KEY_ID"),
            col(col_name).alias("VALUE"),
        )

    def _column_diff(col_name):
        # Pairs present on the left but absent on the right, labelled with
        # the column they came from.
        return (
            _key_and_value(left, col_name)
            .subtract(_key_and_value(right, col_name))
            .withColumn("COL_NAME", lit(col_name))
        )

    diffs = [_column_diff(name) for name in col_names]
    df_tmp = diffs[0]
    for diff in diffs[1:]:
        # unionAll is kept (rather than union) because it is available on
        # every Spark version this snippet may run on.
        df_tmp = df_tmp.unionAll(diff)
    return df_tmp
res_df = unequalColumnValuesTwoDF(dataFrame1,dataFrame2,key_id_col_name,key_id_value)
res_df.show()
>>> dataFrame1.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 500| 1005|
| A2| 700| 10007|
+---+------+------+
>>> dataFrame2.show()
+---+------+------+
| ID|VALUE1|VALUE2|
+---+------+------+
| A1| 600| 1005|
| A2| 700| 10007|
+---+------+------+
>>> res_df.show()
+------+-----+--------+
|KEY_ID|VALUE|COL_NAME|
+------+-----+--------+
| A1| 500| VALUE1|
+------+-----+--------+
Please suggest any other way.
Here is another approach:
Join the two DataFrames using the ID column.
Then for each row, create a new column which contains the columns for which there is a difference.
Create this new column as a key-value pair map using pyspark.sql.functions.create_map() (see note 1 below).
The key for the map will be the column name.
Using pyspark.sql.functions.when(), set the value to the corresponding value in dataFrame1 (as it seems like that is what you want from your example) if there is a difference between the two DataFrames. Otherwise, we set the value to None.
Use pyspark.sql.functions.explode() on the map column, and filter out any rows where the difference is null using pyspark.sql.functions.isnull().
Select the columns you want and rename using alias().
Example:
from functools import reduce  # not a builtin on Python 3

import pyspark.sql.functions as f

# Every column except the join key is compared between the two frames.
columns = [c for c in dataFrame1.columns if c != 'ID']

# Join the frames on ID, build a {column name -> differing value} map for each
# row, then explode that map into one output row per column.
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
    .withColumn(
        'diffs',
        f.create_map(
            # Flatten [[key, value], [key, value], ...] into the alternating
            # key, value, key, value, ... arguments create_map() expects.
            *reduce(
                list.__add__,
                [
                    [
                        f.lit(c),
                        # Keep dataFrame1's value only where the sides differ.
                        # NOTE(review): `!=` evaluates to null when either side
                        # is null, so such pairs fall through to None here -
                        # confirm that is acceptable for the data.
                        f.when(
                            f.col('r.'+c) != f.col('l.'+c),
                            f.col('r.'+c)
                        ).otherwise(None)
                    ]
                    for c in columns
                ]
            )
        )
    )\
    .select([f.col('ID'), f.explode('diffs')])\
    .where(~f.isnull(f.col('value')))\
    .select(
        f.col('ID').alias('KEY_ID'),
        f.col('value').alias('VALUE'),
        f.col('key').alias('COL_NAME')
    )\
    .show(truncate=False)
#+------+-----+--------+
#|KEY_ID|VALUE|COL_NAME|
#+------+-----+--------+
#|A1 |500 |VALUE1 |
#+------+-----+--------+
Notes
1 The syntax *reduce(list.__add__, [[f.lit(c), ...] for c in columns]) as the argument to create_map() is some python-fu that helps create the map dynamically.
create_map() expects an even number of arguments- it assumes that the first argument in every pair is the key and the second is the value. In order to put the arguments in that order, the list comprehension yields a list for each iteration. We reduce this list of lists into a flat list using list.__add__.
Finally the * operator is used to unpack the list.
Here is the intermediate output, which may make the logic clearer:
dataFrame1.alias('r').join(dataFrame2.alias('l'), on='ID')\
.withColumn(
'diffs',
f.create_map(
*reduce(
list.__add__,
[
[
f.lit(c),
f.when(
f.col('r.'+c) != f.col('l.'+c),
f.col('r.'+c)
).otherwise(None)
]
for c in columns
]
)
)
)\
.select('ID', 'diffs').show(truncate=False)
#+---+-----------------------------------+
#|ID |diffs |
#+---+-----------------------------------+
#|A2 |Map(VALUE1 -> null, VALUE2 -> null)|
#|A1 |Map(VALUE1 -> 500, VALUE2 -> null) |
#+---+-----------------------------------+

Resources