Databricks Spark SQL subquery based query throws TreeNodeException - apache-spark

I am running a pretty simple query in a Databricks notebook which involves a subquery.
select recorddate, count(*)
from( select record_date as recorddate, column1
from table1
where record_date >= date_sub(current_date(), 1)
)t
group by recorddate
order by recorddate
I get the following exception:
Error in SQL statement: package.TreeNodeException: Binding attribute, tree: recorddate
When I remove the order by clause, the query runs fine. I see some posts describing similar issues, but none exactly the same. Is this known behavior? Is there any workaround or fix?

This works fine for me (Spark 2.4.5), so I think the problem is something else:
val df = spark.sql("select current_date() as record_date, '1' column1")
df.show(false)
/**
* +-----------+-------+
* |record_date|column1|
* +-----------+-------+
* |2020-07-29 |1 |
* +-----------+-------+
*/
df.createOrReplaceTempView("table1")
spark.sql(
"""
|select recorddate, count(*)
|from( select record_date as recorddate, column1
| from table1
| where record_date >= date_sub(current_date(), 1)
| )t
|group by recorddate
|order by recorddate
|
""".stripMargin)
.show(false)
/**
* +----------+--------+
* |recorddate|count(1)|
* +----------+--------+
* |2020-07-29|1 |
* +----------+--------+
*/
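If the binding error still shows up on your Databricks runtime, one workaround worth trying (a sketch, not verified against that runtime) is to sort by the ordinal position instead of the alias, so the ORDER BY no longer has to re-bind recorddate:
spark.sql(
  """
    |select recorddate, count(*) as cnt
    |from (select record_date as recorddate, column1
    |      from table1
    |      where record_date >= date_sub(current_date(), 1)
    |     ) t
    |group by recorddate
    |order by 1
  """.stripMargin)
  .show(false)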

Related

Median calculation in spark 1.6 Error: expected but identifier DIV found

I am trying to calculate the median of the LATITUDE column grouped by DESTINATION_ID and LOCATION_ID, using Scala with Spark 1.6.
Data in JSON looks like:
DESTINATION_ID,LOCATION_ID,LATITUDE
[ENSG00000257017,EAST_0000182,0.07092000000000001]
[ENSG00000257017,WEST_0001397,0.07092000000000001]
[ENSG00000181965,EAST_1001951,0.07056000000000001]
[ENSG00000146648,EAST_0000616,0.07092000000000001]
[ENSG00000111537,WEST_0001845,0.07092000000000001]
[ENSG00000103222,EAST_0000565,0.07056000000000001]
[ENSG00000118137,EAST_0000508,0.07092000000000001]
[ENSG00000112715,EAST_0000616,0.07092000000000001]
[ENSG00000108984,EAST_0000574,0.07056000000000001]
[ENSG00000159640,NORTH_797,0.07092000000000001]
[ENSG00000113522,NORTH_790,0.07056000000000001]
[ENSG00000133895,NORTH_562,0.07056000000000001]
Code
var ds = sqlContext.sql("""
SELECT DESTINATION_ID,LOCATION_ID, avg(LATITUDE) as median
FROM ( SELECT DESTINATION_ID,LOCATION_ID, LATITUDE, rN, (CASE WHEN cN % 2 = 0 then (cN DIV 2) ELSE (cN DIV 2) + 1 end) as m1, (cN DIV 2) + 1 as m2
FROM (
SELECT DESTINATION_ID,LOCATION_ID, LATITUDE, row_number() OVER (PARTITION BY DESTINATION_ID,LOCATION_ID ORDER BY LATITUDE ) as rN,
count(LATITUDE) OVER (PARTITION BY DESTINATION_ID,LOCATION_ID ) as cN
FROM people
) s
) r
WHERE rN BETWEEN m1 and m2
GROUP BY DESTINATION_ID,LOCATION_ID
""")
Error:
Exception in thread "main" java.lang.RuntimeException: [3.98] failure: ``)''
expected but identifier DIV found
Please help me if I am missing something. Alternatively, is there a better way to calculate a median in Spark?
Thanks
I tried to execute the above query (writing cN DIV 2 as cN / 2) with the test input you provided, as below:
val data =
"""
|DESTINATION_ID,LOCATION_ID,LATITUDE
|ENSG00000257017,EAST_0000182,0.07092000000000001
|ENSG00000257017,WEST_0001397,0.07092000000000001
|ENSG00000181965,EAST_1001951,0.07056000000000001
|ENSG00000146648,EAST_0000616,0.07092000000000001
|ENSG00000111537,WEST_0001845,0.07092000000000001
|ENSG00000103222,EAST_0000565,0.07056000000000001
|ENSG00000118137,EAST_0000508,0.07092000000000001
|ENSG00000112715,EAST_0000616,0.07092000000000001
|ENSG00000108984,EAST_0000574,0.07056000000000001
|ENSG00000159640,NORTH_797,0.07092000000000001
|ENSG00000113522,NORTH_790,0.07056000000000001
""".stripMargin
val stringDS = data.split(System.lineSeparator())
.map(_.split("\\,").map(_.replaceAll("""^[ \t]+|[ \t]+$""", "")).mkString(","))
.toSeq.toDS()
val df = spark.read
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.option("nullValue", "null")
.csv(stringDS)
df.show(false)
df.printSchema()
/**
* +---------------+------------+-------------------+
* |DESTINATION_ID |LOCATION_ID |LATITUDE |
* +---------------+------------+-------------------+
* |ENSG00000257017|EAST_0000182|0.07092000000000001|
* |ENSG00000257017|WEST_0001397|0.07092000000000001|
* |ENSG00000181965|EAST_1001951|0.07056000000000001|
* |ENSG00000146648|EAST_0000616|0.07092000000000001|
* |ENSG00000111537|WEST_0001845|0.07092000000000001|
* |ENSG00000103222|EAST_0000565|0.07056000000000001|
* |ENSG00000118137|EAST_0000508|0.07092000000000001|
* |ENSG00000112715|EAST_0000616|0.07092000000000001|
* |ENSG00000108984|EAST_0000574|0.07056000000000001|
* |ENSG00000159640|NORTH_797 |0.07092000000000001|
* |ENSG00000113522|NORTH_790 |0.07056000000000001|
* +---------------+------------+-------------------+
*
* root
* |-- DESTINATION_ID: string (nullable = true)
* |-- LOCATION_ID: string (nullable = true)
* |-- LATITUDE: double (nullable = true)
*/
df.createOrReplaceTempView("people")
spark.sql(
"""
|SELECT
| DESTINATION_ID,
| LOCATION_ID,
| avg(LATITUDE) as median
|FROM
| (
| SELECT
| DESTINATION_ID,
| LOCATION_ID,
| LATITUDE,
| rN,
| (
| CASE WHEN cN % 2 = 0 then (cN / 2) ELSE (cN / 2) + 1 end
| ) as m1,
| (cN / 2) + 1 as m2
| FROM
| (
| SELECT
| DESTINATION_ID,
| LOCATION_ID,
| LATITUDE,
| row_number() OVER (
| PARTITION BY DESTINATION_ID,
| LOCATION_ID
| ORDER BY
| LATITUDE
| ) as rN,
| count(LATITUDE) OVER (PARTITION BY DESTINATION_ID, LOCATION_ID) as cN
| FROM
| people
| ) s
| ) r
|WHERE
| rN BETWEEN m1
| and m2
|GROUP BY
| DESTINATION_ID,
| LOCATION_ID
""".stripMargin)
.show(false)
/**
* +--------------+-----------+------+
* |DESTINATION_ID|LOCATION_ID|median|
* +--------------+-----------+------+
* +--------------+-----------+------+
*/
You need to check your query or input; it's not producing any output with this data.
Check if the query below helps:
spark.sql(
"""
|SELECT *
|FROM people k NATURAL JOIN
|(SELECT
| DESTINATION_ID,
| LOCATION_ID,
| avg(LATITUDE) as median
|FROM
| (
| SELECT
| DESTINATION_ID,
| LOCATION_ID,
| LATITUDE,
| rN,
| (
| CASE WHEN cN % 2 = 0 then (cN / 2) ELSE (cN / 2) - 1 end
| ) as m1,
| (cN / 2) + 1 as m2
| FROM
| (
| SELECT
| DESTINATION_ID,
| LOCATION_ID,
| LATITUDE,
| row_number() OVER (
| PARTITION BY DESTINATION_ID,
| LOCATION_ID
| ORDER BY
| LATITUDE
| ) as rN,
| count(LATITUDE) OVER (PARTITION BY DESTINATION_ID, LOCATION_ID) as cN
| FROM
| people
| ) s
| ) r
|WHERE
| rN BETWEEN m1
| and m2
|GROUP BY
| DESTINATION_ID,
| LOCATION_ID
| ) t
""".stripMargin)
.show(false)
/**
* +---------------+------------+-------------------+-------------------+
* |DESTINATION_ID |LOCATION_ID |LATITUDE |median |
* +---------------+------------+-------------------+-------------------+
* |ENSG00000111537|WEST_0001845|0.07092000000000001|0.07092000000000001|
* |ENSG00000257017|WEST_0001397|0.07092000000000001|0.07092000000000001|
* |ENSG00000103222|EAST_0000565|0.07056000000000001|0.07056000000000001|
* |ENSG00000108984|EAST_0000574|0.07056000000000001|0.07056000000000001|
* |ENSG00000112715|EAST_0000616|0.07092000000000001|0.07092000000000001|
* |ENSG00000113522|NORTH_790 |0.07056000000000001|0.07056000000000001|
* |ENSG00000118137|EAST_0000508|0.07092000000000001|0.07092000000000001|
* |ENSG00000146648|EAST_0000616|0.07092000000000001|0.07092000000000001|
* |ENSG00000159640|NORTH_797 |0.07092000000000001|0.07092000000000001|
* |ENSG00000181965|EAST_1001951|0.07056000000000001|0.07056000000000001|
* |ENSG00000257017|EAST_0000182|0.07092000000000001|0.07092000000000001|
* +---------------+------------+-------------------+-------------------+
*/
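As for the second part of the question (a better way to compute a median in Spark): percentile_approx avoids the window-function gymnastics entirely. A sketch against the same people view, assuming the function is available (it is built in from Spark 2.1; on Spark 1.6 it requires a HiveContext):
// Approximate median per group, one aggregate call instead of nested window queries.
spark.sql(
  """
    |SELECT DESTINATION_ID,
    |       LOCATION_ID,
    |       percentile_approx(LATITUDE, 0.5) AS median
    |FROM people
    |GROUP BY DESTINATION_ID, LOCATION_ID
  """.stripMargin)
  .show(false)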

Get value for latest record incase of multiple records for same group

I have a dataset that can have multiple records for an id column, grouped on other columns. For this dataset, I want to derive a new column only for the latest record of each group. I was using a case statement to derive the new column and a UNION to pick up the latest record, but I would like to avoid UNION since it is an expensive operation in Spark SQL.
Input:
person_id order_id order_ts order_amt
1 1 2020-01-01 10:10:10 10
1 2 2020-01-01 10:15:15 15
2 3 2020-01-01 10:10:10 0
2 4 2020-01-01 10:15:15 15
From the above input, person_id 1 has two orders (1,2) and person_id 2 has two orders (3,4). I want to derive a column for only latest order for a given person.
Expected Output:
person_id order_id order_ts order_amt valid_order
1 1 2020-01-01 10:10:10 10 N
1 2 2020-01-01 10:15:15 15 Y
2 3 2020-01-01 10:10:10 0 N
2 4 2020-01-01 10:15:15 15 Y
I tried below query to get the output using UNION in the query:
select person_id, order_id, order_ts, order_amt, valid_order
from
(
select *, row_number() over(partition by order_id order by derive_order) as rnk
from
(
select person_id, order_id, order_ts, order_amt, 'N' as valid_order, 'before' as derive_order
from test_table
UNION
select person_id, order_id, order_ts, order_amt,
case when order_amt is not null and order_amt >0 then 'Y' else 'N' end as valid_order,
'after' as derive_order
from
(
select *, row_number() over(partition by person_id order by order_ts desc) as rnk
from test_table
) where rnk = 1
) final
) where rnk = 1 order by person_id, order_id;
I also got the same output using a combination of left outer join and inner join.
Join Query:
select final.person_id, final.order_id, final.order_ts, final.order_amt,
case when final.valid_order is null then 'N' else final.valid_order end as valid_order
from
(
select c.person_id, c.order_id, c.order_ts, c.order_amt, d.valid_order from test_table c
left outer join
(
select a.*, case when a.order_amt is not null and a.order_amt >0 then 'Y' else 'N' end as valid_order
from test_table a
inner join
(
select person_id, max(order_id) as order_id from test_table group by 1
) b on a.person_id = b.person_id and a.order_id = b.order_id
) d on c.order_id = d.order_id
) final order by person_id, order_id;
Our input dataset will have around 20 million records. Is there a better-optimized way to get the same output than the queries above?
Any help would be appreciated.
Check if this helps:
val data =
"""
|person_id | order_id | order_ts |order_amt
| 1 | 1 | 2020-01-01 10:10:10 | 10
| 1 | 2 | 2020-01-01 10:15:15 | 15
| 2 | 3 | 2020-01-01 10:10:10 | 0
| 2 | 4 | 2020-01-01 10:15:15 | 15
""".stripMargin
val stringDS = data.split(System.lineSeparator())
.map(_.split("\\|").map(_.replaceAll("""^[ \t]+|[ \t]+$""", "")).mkString(","))
.toSeq.toDS()
val df = spark.read
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.option("nullValue", "null")
.csv(stringDS)
df.printSchema()
df.show(false)
/**
* root
* |-- person_id: integer (nullable = true)
* |-- order_id: integer (nullable = true)
* |-- order_ts: timestamp (nullable = true)
* |-- order_amt: integer (nullable = true)
*
* +---------+--------+-------------------+---------+
* |person_id|order_id|order_ts |order_amt|
* +---------+--------+-------------------+---------+
* |1 |1 |2020-01-01 10:10:10|10 |
* |1 |2 |2020-01-01 10:15:15|15 |
* |2 |3 |2020-01-01 10:10:10|0 |
* |2 |4 |2020-01-01 10:15:15|15 |
* +---------+--------+-------------------+---------+
*/
Using the Spark DSL:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import spark.implicits._

df.withColumn("latest", max($"order_ts").over(Window.partitionBy("person_id")))
.withColumn("valid_order", when(unix_timestamp($"latest") - unix_timestamp($"order_ts") =!= 0, lit("N"))
.otherwise(lit("Y"))
)
.show(false)
/**
* +---------+--------+-------------------+---------+-------------------+-----------+
* |person_id|order_id|order_ts |order_amt|latest |valid_order|
* +---------+--------+-------------------+---------+-------------------+-----------+
* |2 |3 |2020-01-01 10:10:10|0 |2020-01-01 10:15:15|N |
* |2 |4 |2020-01-01 10:15:15|15 |2020-01-01 10:15:15|Y |
* |1 |1 |2020-01-01 10:10:10|10 |2020-01-01 10:15:15|N |
* |1 |2 |2020-01-01 10:15:15|15 |2020-01-01 10:15:15|Y |
* +---------+--------+-------------------+---------+-------------------+-----------+
*/
Using Spark SQL:
df.createOrReplaceTempView("order_table")
spark.sql(
"""
|select person_id, order_id, order_ts, order_amt, latest,
| case when (unix_timestamp(latest) - unix_timestamp(order_ts) != 0) then 'N' else 'Y' end as valid_order
| from
| (select person_id, order_id, order_ts, order_amt, max(order_ts) over (partition by person_id) as latest FROM order_table) a
""".stripMargin)
.show(false)
/**
* +---------+--------+-------------------+---------+-------------------+-----------+
* |person_id|order_id|order_ts |order_amt|latest |valid_order|
* +---------+--------+-------------------+---------+-------------------+-----------+
* |2 |3 |2020-01-01 10:10:10|0 |2020-01-01 10:15:15|N |
* |2 |4 |2020-01-01 10:15:15|15 |2020-01-01 10:15:15|Y |
* |1 |1 |2020-01-01 10:10:10|10 |2020-01-01 10:15:15|N |
* |1 |2 |2020-01-01 10:15:15|15 |2020-01-01 10:15:15|Y |
* +---------+--------+-------------------+---------+-------------------+-----------+
*/
It can be done without joins or UNION. Also, the condition a.order_amt is not null and a.order_amt > 0 is redundant, because an amount greater than 0 is already NOT NULL.
select person_id, order_id, order_ts, order_amt,
case when rn=1 and order_amt>0 then 'Y' else 'N' end as valid_order
from
(
select person_id, order_id, order_ts, order_amt,
row_number() over(partition by person_id order by order_ts desc) as rn
from test_table a
) s
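Roughly the same logic in the DataFrame DSL, assuming the input is loaded as df with the schema shown earlier (a sketch):
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Rank each person's orders newest-first; only the newest order with a positive amount is valid.
val w = Window.partitionBy("person_id").orderBy(col("order_ts").desc)
df.withColumn("rn", row_number().over(w))
  .withColumn("valid_order", when(col("rn") === 1 && col("order_amt") > 0, lit("Y")).otherwise(lit("N")))
  .drop("rn")
  .show(false)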

SparkSQL - Extract multiple regex matches (using SQL only)

I have a dataset of SQL queries in raw text and another with a regular expression of all the possible table names:
# queries
+-----+----------------------------------------------+
| id | query |
+-----+----------------------------------------------+
| 1 | select * from table_a, table_b |
| 2 | select * from table_c join table_d... |
+-----+----------------------------------------------+
# regexp
'table_a|table_b|table_c|table_d'
And I wanted the following result:
# expected result
+-----+----------------------------------------------+
| id | tables |
+-----+----------------------------------------------+
| 1 | [table_a, table_b] |
| 2 | [table_c, table_d] |
+-----+----------------------------------------------+
But using the following SQL in Spark, all I get is the first match...
select
id,
regexp_extract(query, 'table_a|table_b|table_c|table_d') as tables
from queries
# actual result
+-----+----------------------------------------------+
| id | tables |
+-----+----------------------------------------------+
| 1 | table_a |
| 2 | table_c |
+-----+----------------------------------------------+
Is there any way to do this using only Spark SQL? This is the function I am using https://people.apache.org/~pwendell/spark-nightly/spark-master-docs/latest/api/sql/#regexp_extract
EDIT
I would also accept a solution that returned the following:
# alternative solution
+-----+----------------------------------------------+
| id | tables |
+-----+----------------------------------------------+
| 1 | table_a |
| 1 | table_b |
| 2 | table_c |
| 2 | table_d |
+-----+----------------------------------------------+
SOLUTION
#chlebek solved this below. I reformatted his SQL using CTEs for better readability:
with
split_queries as (
select
id,
explode(split(query, ' ')) as col
from queries
),
extracted_tables as (
select
id,
regexp_extract(col, 'table_a|table_b|table_c|table_d', 0) as rx
from split_queries
)
select
id,
collect_set(rx) as tables
from extracted_tables
where rx != ''
group by id
Bear in mind that the split(query, ' ') part of the query will split your SQL only by spaces. If you have other things such as tabs, line breaks, comments, etc., you should deal with these before or when splitting.
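For instance, splitting on runs of whitespace and commas instead of a single space also copes with tabs and line breaks. A sketch of the same query with only the split expression changed (reusing the queries view):
// Same CTE pipeline, but split on whitespace runs and commas so tabs/newlines
// don't leave stray tokens like "table_a," behind.
spark.sql(
  """
    |with split_queries as (
    |  select id, explode(split(query, '[\\s,]+')) as col
    |  from queries
    |),
    |extracted_tables as (
    |  select id, regexp_extract(col, 'table_a|table_b|table_c|table_d', 0) as rx
    |  from split_queries
    |)
    |select id, collect_set(rx) as tables
    |from extracted_tables
    |where rx != ''
    |group by id
  """.stripMargin)
  .show(false)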
If you have only a few values to check, you can achieve this with the contains function instead of a regexp:
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._
import spark.implicits._

val names = Seq("table_a","table_b","table_c","table_d")
def c(col: Column) = names.map(n => when(col.contains(n),n).otherwise(""))
df.select('id,array_remove(array(c('query):_*),"").as("result")).show(false)
But using a regexp it will look like below (Spark DataFrame API):
df.select('id,explode(split('query," ")))
.select('id,regexp_extract('col,"table_a|table_b|table_c|table_d",0).as("rx"))
.filter('rx=!="")
.groupBy('id)
.agg(collect_list('rx))
and it can be translated to the SQL query below:
select id, collect_list(rx) from
(select id, regexp_extract(col,'table_a|table_b|table_c|table_d',0) as rx from
(select id, explode(split(query,' ')) as col from df) q1
) q2
where rx != '' group by id
so output will be:
+---+------------------+
| id| collect_list(rx)|
+---+------------------+
| 1|[table_a, table_b]|
| 2|[table_c, table_d]|
+---+------------------+
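If you can use a newer Spark version (3.1+, where regexp_extract_all was added), the splitting step is not needed at all; a sketch against the same queries view:
// Extract every regex match in one pass, then dedupe the resulting array.
spark.sql(
  """
    |select id,
    |       array_distinct(regexp_extract_all(query, 'table_a|table_b|table_c|table_d', 0)) as tables
    |from queries
  """.stripMargin)
  .show(false)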
Since you are using Spark SQL, you can use Spark's SQL parser and it will do the job for you.
def getTables(query: String): Seq[String] = {
val logicalPlan = spark.sessionState.sqlParser.parsePlan(query)
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
logicalPlan.collect { case r: UnresolvedRelation => r.tableName }
}
val query = "select * from table_1 as a left join table_2 as b on
a.id=b.id"
scala> getTables(query).foreach(println)
table_1
table_2
You can register getTables as a UDF and use it in a query, as sketched below.
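A minimal sketch of that wiring, assuming a queries view like the one in the question (note that getTables relies on spark.sessionState, i.e. the driver's SparkSession, so this is a driver-side sketch rather than something guaranteed to work when the UDF is shipped to executors):
// Register the parser-based helper as a UDF and call it from SQL.
spark.udf.register("get_tables", (q: String) => getTables(q))
spark.sql("select id, get_tables(query) as tables from queries").show(false)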
You can use another SQL function available in Spark called collect_list: https://docs.databricks.com/spark/latest/spark-sql/language-manual/functions.html#collect_list. Another sample is at https://mungingdata.com/apache-spark/arraytype-columns/.
Basically, applied to your code it would be:
val df = spark.sql("select 1 id, 'select * from table_a, table_b' query" )
val df1 = spark.sql("select 2 id, 'select * from table_c join table_d' query" )
val df3 = df.union(df1)
df3.createOrReplaceTempView("tabla")
spark.sql("""
select id, collect_list(tables) from (
select id, explode(split(query, ' ')) as tables
from tabla)
where tables like 'table%' group by id""").show
The output will be
+---+--------------------+
| id|collect_list(tables)|
+---+--------------------+
| 1| [table_a,, table_b]|
| 2| [table_c, table_d]|
+---+--------------------+
Hope this helps
If you are on Spark >= 2.4, you can avoid the explode-and-collect round trip by using higher-order functions on arrays, without any subqueries.
Load the test data:
val data =
"""
|id | query
|1 | select * from table_a, table_b
|2 | select * from table_c join table_d on table_c.id=table_d.id
""".stripMargin
val stringDS = data.split(System.lineSeparator())
.map(_.split("\\|").map(_.replaceAll("""^[ \t]+|[ \t]+$""", "")).mkString(";"))
.toSeq.toDS()
val df = spark.read
.option("sep", ";")
.option("inferSchema", "true")
.option("header", "true")
.option("nullValue", "null")
.csv(stringDS)
df.printSchema()
df.show(false)
/**
* root
* |-- id: integer (nullable = true)
* |-- query: string (nullable = true)
*
* +---+-----------------------------------------------------------+
* |id |query |
* +---+-----------------------------------------------------------+
* |1 |select * from table_a, table_b |
* |2 |select * from table_c join table_d on table_c.id=table_d.id|
* +---+-----------------------------------------------------------+
*/
Extract the tables from the query:
// spark >= 2.4.0
df.createOrReplaceTempView("queries")
spark.sql(
"""
|select id,
| array_distinct(
| FILTER(
| split(query, '\\.|=|\\s+|,'), x -> x rlike 'table_a|table_b|table_c|table_d'
| )
| )as tables
|FROM
| queries
""".stripMargin)
.show(false)
/**
* +---+------------------+
* |id |tables |
* +---+------------------+
* |1 |[table_a, table_b]|
* |2 |[table_c, table_d]|
* +---+------------------+
*/
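The same expression can be used from the DataFrame API by pushing the higher-order function through expr (still Spark 2.4+):
import org.apache.spark.sql.functions.{col, expr}

// Identical filter + array_distinct, just wrapped in expr() so it can be used as a Column.
df.select(
    col("id"),
    expr("""array_distinct(filter(split(query, '\\.|=|\\s+|,'), x -> x rlike 'table_a|table_b|table_c|table_d'))""").as("tables"))
  .show(false)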

Spark structured streaming drop duplicates keep last

I would like to maintain a streaming dataframe that gets updates.
To do so I will use dropDuplicates.
But dropDuplicates drops the latest change.
How can I retain only the last record?
Assuming you need to select the last record per id by removing the other duplicates, you can use window functions and filter on row_number = count. Check this out:
scala> val df = Seq((120,34.56,"2018-10-11"),(120,65.73,"2018-10-14"),(120,39.96,"2018-10-20"),(122,11.56,"2018-11-20"),(122,24.56,"2018-10-20")).toDF("id","amt","dt")
df: org.apache.spark.sql.DataFrame = [id: int, amt: double ... 1 more field]
scala> val df2=df.withColumn("dt",'dt.cast("date"))
df2: org.apache.spark.sql.DataFrame = [id: int, amt: double ... 1 more field]
scala> df2.show(false)
+---+-----+----------+
|id |amt |dt |
+---+-----+----------+
|120|34.56|2018-10-11|
|120|65.73|2018-10-14|
|120|39.96|2018-10-20|
|122|11.56|2018-11-20|
|122|24.56|2018-10-20|
+---+-----+----------+
scala> df2.createOrReplaceTempView("ido")
scala> spark.sql(""" select id,amt,dt,row_number() over(partition by id order by dt) rw, count(*) over(partition by id) cw from ido """).show(false)
+---+-----+----------+---+---+
|id |amt |dt |rw |cw |
+---+-----+----------+---+---+
|122|24.56|2018-10-20|1 |2 |
|122|11.56|2018-11-20|2 |2 |
|120|34.56|2018-10-11|1 |3 |
|120|65.73|2018-10-14|2 |3 |
|120|39.96|2018-10-20|3 |3 |
+---+-----+----------+---+---+
scala> spark.sql(""" select id,amt,dt from (select id,amt,dt,row_number() over(partition by id order by dt) rw, count(*) over(partition by id) cw from ido) where rw=cw """).show(false)
+---+-----+----------+
|id |amt |dt |
+---+-----+----------+
|122|11.56|2018-11-20|
|120|39.96|2018-10-20|
+---+-----+----------+
scala>
If you want to sort on dt descending, you can just use "order by dt desc" in the over() clause. Does this help?
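For the streaming part of the question itself, window functions cannot be applied directly to a streaming DataFrame, so one common pattern (a sketch, assuming Spark 2.4+, a streaming input called streamingDf with the same id/amt/dt columns, and a hypothetical output path) is to run the same keep-last dedup inside foreachBatch on each micro-batch:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

streamingDf.writeStream
  .foreachBatch { (batch: DataFrame, batchId: Long) =>
    // Keep only the newest row per id within this micro-batch, then append it to the sink.
    val w = Window.partitionBy("id").orderBy(col("dt").desc)
    batch.withColumn("rw", row_number().over(w))
      .filter(col("rw") === 1)
      .drop("rw")
      .write.mode("append").format("parquet").save("/tmp/dedup_output") // hypothetical path
  }
  .start()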

hive comment get NULL in spark-sql

I execute "ALTER TABLE table_name CHANGE COLUMN col_name col_name column_type COMMENT col_comment;" to add comment to my table, and I succeeded.In hive I desc my table and I got like this:
hive> desc mytable;
+---------+----------+--------+
|col_name |data_type |comment |
+---------+----------+--------+
|col1     |string    |name    |
+---------+----------+--------+
but in spark-sql the comment is gone:
spark-sql> desc mytable;
+---------+----------+--------+
|col_name |data_type |comment |
+---------+----------+--------+
|col1     |string    |null    |
+---------+----------+--------+
By the way, I use MySQL for the metastore.
How can I get the comment in spark-sql?
