How can we execute sql in python pandas :
SQL :
select
a.*,
b.vol1 / sum(vol1) over (
partition by a.sale, a.d_id,
a.month, a.p_id
) vol_r,
a.vol2* b.vol1/ sum(b.vol1) over (
partition by a.sale, a.d_id,
a.month, a.p_id
) vol_t
from
sales1 a
left join sales2 b on a.sale = b.sale
and a.d_id = b.d_id
and a.month = b.month
and a.p_id = b.p_id
I know one way is pandasql but getting error as sql involves partition by.
Related
I have the following SQL Query:
Select st.Value,
st.Id,
ntile(2) OVER (PARTITION BY St.Id, St.VarId ORDER By St.Sls),
AVG(St.Value) OVER (PARTITION BY St.Id, St.VarId ORDER By St.Sls, St.Date)
FROM table tb
INNER JOIN staging st on St.Id = tb.Id
I've tried to adapt this to Spark/PySpark using window function, my code is below:
windowSpec_1 = Window.partitionBy("staging.Id", "staging.VarId").orderBy("staging.Sls")
windowSpec_2 = Window.partitionBy("staging.Id", "staging.VarId").orderBy("staging.Sls", "staging.Date")
df= table.join(
staging,
on=f.col("staging.Id") == f.col("table.Id"),
how='inner'
).select(
f.col("staging.Value"),
f.ntile(2).over(windowSpec_1),
f.avg("staging.Value").over(windowSpec_2)
)
Although I'm getting the following error:
pyspark.sql.utils.AnalysisException: Can't extract value from Value#42928: need struct type but got decimal(16,6)
How Can I solve this problem? Is it necessary to group data?
Maybe you forgot to assign alias to staging?:
df= table.join(
staging.alias("staging"),
on=f.col("staging.Id") == f.col("table.Id"),
how='inner'
).select(
f.col("staging.Value"),
f.ntile(2).over(windowSpec_1),
f.avg("staging.Value").over(windowSpec_2)
)
I want to convert the below query's to a spark data frame (I am pretty new to spark):
-- Creating group number
select distinct *, DENSE_RANK() OVER(ORDER BY person_id, trust_id) AS group_number;
-- This is what I got so far for above
df = self.spark.sql("select person_id, trust_id, insurance_id, amount, time_of_app, place_of_app from {}".format(self.tables['people']))
df = df.withColumn("group_number", dense_rank().over(Window.partitionBy("person_id", "trust_id").OrderBy("person_id", "trust_id")))
-- Different query 1
where group_number in (select group_number from etl_table_people where code like 'H%') group by group_number having count(distinct amount) > 1;
-- Different query 2
where insurance_id = 'V94.12'
group by group_number having count(distinct amount) = 2;
What you are looking for is Window Spec Function of spark.
val windowSpec = Window.partitionBy("person_id","trust_id").orderBy(col("person_id").desc).orderBy(col("trust_id").desc)
df.withColumn("group_number", dense_rank() over windowSpec)
And you get your data frame using spark based on your Data Source. You can refer this if your source is Hive
Hi I am trying to query hive table from spark context.
my code:
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)
bank = hive_context.table('select * from db.table_name')
bank.show()
simple queries like this works fine, without any error.
but when I try with below query.
query = """with table1 as ( select distinct a,b
from db_first.table_first
order by b )
--select * from table1 order by b
,c as ( select *
from db_first.table_two)
--select * from c
,d as ( select *
from c
where upper(e) = 'Y')
--select * from d
,f as ( select table1.b
,cast(regexp_extract(g,'(\\d+)-(A|B)-
(\\d+)(.*)',1) as Int) aid1
,regexp_extract(g,'(\\d+)-(A|B)-
(\\d+)(.*)',2) aid2
,cast(regexp_extract(g,'(\\d+)-(A|B)-
(\\d+)(.*)',3) as Int) aid3
,from_unixtime(cast(substr(lastdbupdatedts,1,10) as int),"yyyy-MM-dd
HH:mm:ss") lastupdts
,d.*
from d
left outer join table1
on d.hiba = table1.a)
select * from f order by b,aid1,aid2,aid3 limit 100"""
I get the below error, please help.
ParseExceptionTraceback (most recent call last)
<ipython-input-27-cedb6fad210d> in <module>()
3 hive_context = HiveContext(sc)
4 #bank = hive_context.table("bdalab.test_prodapt_inv")
----> 5 bank = hive_context.table(first)
ParseException: u"\nmismatched input '*' expecting <EOF>(line 1, pos 7)\n\n== SQL ==\nselect *
You need to use .sql method instead of .table method if we are using sql query.
1.Using .table method then we need to provide table name:
>>> hive_context.table("<db_name>.<table_name>").show()
2.Using .sql method then provide your with cte expression:
>>> first ="with cte..."
>>> hive_context.sql(first).show()
The following HiveQL code takes about 3 to 4 hours and I am trying effectively convert this into a pyspark data frame code. Any dataframe experts input is appreciated a lot.
INSERT overwrite table dlstage.DIBQtyRank_C11 PARTITION(fiscalyearmonth)
SELECT * FROM
(SELECT a.matnr, a.werks, a.periodstartdate, a.fiscalyear, a.fiscalmonth,b.dy_id, MaterialType,(COALESCE(a.salk3,0)) salk3,(COALESCE(a.lbkum,0)) lbkum, sum(a.valuatedquantity) AS valuatedquantity, sum(a.InventoryValue) AS InventoryValue,
rank() over (PARTITION by dy_id, werks, matnr order by a.max_date DESC) rnk, sum(stprs) stprs, max(peinh) peinh, fcurr,fiscalyearmonth
FROM dlstage.DIBmsegFinal a
LEFT JOIN dlaggr.dim_fiscalcalendar b ON a.periodstartdate=b.fmth_begin_dte WHERE a.max_date >= b.fmth_begin_dte AND a.max_date <= b.dy_id and
fiscalYearmonth = concat(fyr_id,lpad(fmth_nbr,2,0))
GROUP BY a.matnr, a.werks,dy_id, max_date, a.periodstartdate, a.fiscalyear, a.fiscalmonth, MaterialType, fcurr, COALESCE(a.salk3,0), COALESCE(a.lbkum,0),fiscalyearmonth) a
WHERE a.rnk=1 and a.fiscalYear = '%s'" %(year) + " and a.fiscalmonth ='%s'" %(mnth)
This is my statement:
val Porders = sqlContext.sql(
"""SELECT count(STATUS_CD)
FROM s_order
WHERE STATUS_CD = 'pending' AND ROW_ID IN
( SELECT so.ROW_ID FROM s_order so
JOIN s_order_item soi
ON so.ROW_ID = soi.ORDER_ID
JOIN s_order_type sot
ON so.ORDER_TYPE_ID = sot.ROW_ID
JOIN s_product sp
ON soi.PROD_ID = sp.ROW_ID
WHERE (sp.NAME like '%VIP%' OR sp.NAME like '%BIZ%' OR sp.NAME like '%UniFi%')
AND LOWER(sot.NAME) = 'new install')
""")
I receive the following error:
ERROR : java.lang.RuntimeException: [3.3] failure: identifier expected
( SELECT so.ROW_ID FROM s_order so JOIN s_order_item soi
^
What could be the reason?
The reason, why this happens, is that subqueries are not supported: see Spark-4226.
Even a query like
sqlContext.sql(
"""SELECT count(STATUS_CD)
FROM s_order
WHERE STATUS_CD = 'pending' AND ROW_ID IN
(SELECT * FROM s_order)
""")
does not work currently (speaking of Spark SQL 1.5.1)
Try to replace your subquery by a join, e.g. https://dev.mysql.com/doc/refman/5.1/en/rewriting-subqueries.html