SPARK SQL CREATE VIEW - apache-spark

I have noticed that there seems to be some max length of the create view statement. Below is a SQL query I can run (select statement). If I put a create view in front, it works as long as I limit the columns in the sub-query. The view itself would only contain one column. I have provided this for example. I am running my sql through thrift server. Is there any way to get around this issue? Thanks in advance!
Working
select p from
(
select p
from
(select p,concat(
p1,',',p2,',',p3,',',p4,',',p5,',',p6,',',p7,',',p8,',',p9,',',p10,',',p11,',',p12,',',p13,',',p14,',',p15,',',p16,',',p17,',',p18,',',p19,',',p20,',',
p21,',',p22,',',p23,',',p24,',',p25,',',p26,',',p27,',',p28,',',p29,',',p30,',',p31,',',p32,',',p33,',',p34,',',p35,',',p36,',',p37,',',p38,',',p39,',',p40,',',
p41,',',p42,',',p43,',',p44,',',p45,',',p46,',',p47,',',p48,',',p49,',',p50,',',p51,',',p52,',',p53,',',p54,',',p55,',',p56,',',p57,',',p58,',',p59,',',p60,',',
p61,',',p62,',',p63,',',p64,',',p65,',',p66,',',p67,',',p68,',',p69,',',p70,',',p71,',',p72,',',p73,',',p74,',',p75,',',p76,',',p77,',',p78,',',p79,',',p80,',',
p81,',',p82,',',p83,',',p84,',',p85,',',p86,',',p87,',',p88,',',p89,',',p90,',',p91,',',p92,',',p93,',',p94,',',p95,',',p96,',',p97,',',p98,',',p99,',',p100,',',
p101,',',p102,',',p103,',',p104,',',p105,',',p106,',',p107,',',p108,',',p109,',',p110,',',p111,',',p112,',',p113,',',p114,',',p115,',',p116,',',p117,',',p118,',',p119,',',p120,',',
p121,',',p122,',',p123,',',p124,',',p125,',',p126,',',p127,',',p128,',',p129,',',p130,',',p131,',',p132,',',p133,',',p134,',',p135,',',p136,',',p137,',',p138,',',p139,',',p140,',',
p141,',',p142,',',p143,',',p144,',',p145,',',p146,',',p147,',',p148,',',p149,',',p150,',',p151,',',p152,',',p153,',',p154,',',p155,',',p156,',',p157,',',p158,',',p159,',',p160,',',
p161,',',p162,',',p163,',',p164,',',p165,',',p166,',',p167,',',p168,',',p169,',',p170,',',p171,',',p172,',',p173,',',p174,',',p175,',',p176,',',p177,',',p178,',',p179,',',p180,',',
p181,',',p182,',',p183,',',p184,',',p185,',',p186,',',p187,',',p188,',',p189,',',p190,',',p191,',',p192,',',p193,',',p194,',',p195,',',p196,',',p197,',',p198,',',p199,',',p200,',',
p201,',',p202,',',p203,',',p204,',',p205,',',p206,',',p207,',',p208,',',p209,',',p210,',',p211,',',p212,',',p213,',',p214,',',p215,',',p216,',',p217,',',p218,',',p219,',',p220,',',
p221,',',p222,',',p223,',',p224,',',p225,',',p226,',',p227,',',p228,',',p229,',',p230,',',p231,',',p232,',',p233,',',p234,',',p235,',',p236,',',p237,',',p238,',',p239,',',p240,',',
p241,',',p242,',',p243,',',p244,',',p245,',',p246,',',p247,',',p248,',',p249,',',p250,',',p251,',',p252,',',p253,',',p254,',',p255,',',p256,',',p257,',',p258,',',p259,',',p260,',',
p261,',',p262,',',p263,',',p264,',',p265,',',p266,',',p267,',',p268,',',p269,',',p270,',',p271,',',p272,',',p273,',',p274,',',p275,',',p276,',',p277,',',p278,',',p279) vector
from table1) a
) c
Not working
CREATE VIEW TEST AS
select p from
(
select p
from
(select p,concat(
p1,',',p2,',',p3,',',p4,',',p5,',',p6,',',p7,',',p8,',',p9,',',p10,',',p11,',',p12,',',p13,',',p14,',',p15,',',p16,',',p17,',',p18,',',p19,',',p20,',',
p21,',',p22,',',p23,',',p24,',',p25,',',p26,',',p27,',',p28,',',p29,',',p30,',',p31,',',p32,',',p33,',',p34,',',p35,',',p36,',',p37,',',p38,',',p39,',',p40,',',
p41,',',p42,',',p43,',',p44,',',p45,',',p46,',',p47,',',p48,',',p49,',',p50,',',p51,',',p52,',',p53,',',p54,',',p55,',',p56,',',p57,',',p58,',',p59,',',p60,',',
p61,',',p62,',',p63,',',p64,',',p65,',',p66,',',p67,',',p68,',',p69,',',p70,',',p71,',',p72,',',p73,',',p74,',',p75,',',p76,',',p77,',',p78,',',p79,',',p80,',',
p81,',',p82,',',p83,',',p84,',',p85,',',p86,',',p87,',',p88,',',p89,',',p90,',',p91,',',p92,',',p93,',',p94,',',p95,',',p96,',',p97,',',p98,',',p99,',',p100,',',
p101,',',p102,',',p103,',',p104,',',p105,',',p106,',',p107,',',p108,',',p109,',',p110,',',p111,',',p112,',',p113,',',p114,',',p115,',',p116,',',p117,',',p118,',',p119,',',p120,',',
p121,',',p122,',',p123,',',p124,',',p125,',',p126,',',p127,',',p128,',',p129,',',p130,',',p131,',',p132,',',p133,',',p134,',',p135,',',p136,',',p137,',',p138,',',p139,',',p140,',',
p141,',',p142,',',p143,',',p144,',',p145,',',p146,',',p147,',',p148,',',p149,',',p150,',',p151,',',p152,',',p153,',',p154,',',p155,',',p156,',',p157,',',p158,',',p159,',',p160,',',
p161,',',p162,',',p163,',',p164,',',p165,',',p166,',',p167,',',p168,',',p169,',',p170,',',p171,',',p172,',',p173,',',p174,',',p175,',',p176,',',p177,',',p178,',',p179,',',p180,',',
p181,',',p182,',',p183,',',p184,',',p185,',',p186,',',p187,',',p188,',',p189,',',p190,',',p191,',',p192,',',p193,',',p194,',',p195,',',p196,',',p197,',',p198,',',p199,',',p200,',',
p201,',',p202,',',p203,',',p204,',',p205,',',p206,',',p207,',',p208,',',p209,',',p210,',',p211,',',p212,',',p213,',',p214,',',p215,',',p216,',',p217,',',p218,',',p219,',',p220,',',
p221,',',p222,',',p223,',',p224,',',p225,',',p226,',',p227,',',p228,',',p229,',',p230,',',p231,',',p232,',',p233,',',p234,',',p235,',',p236,',',p237,',',p238,',',p239,',',p240,',',
p241,',',p242,',',p243,',',p244,',',p245,',',p246,',',p247,',',p248,',',p249,',',p250,',',p251,',',p252,',',p253,',',p254,',',p255,',',p256,',',p257,',',p258,',',p259,',',p260,',',
p261,',',p262,',',p263,',',p264,',',p265,',',p266,',',p267,',',p268,',',p269,',',p270,',',p271,',',p272,',',p273,',',p274,',',p275,',',p276,',',p277,',',p278,',',p279) vector
from table1) a
) c
WORKING
CREATE VIEW TEST AS
select p from
(
select p
from
(select p,concat(
p1,',',p2,',',p3,',',p4,',',p5,',',p6,',',p7,',',p8,',',p9,',',p10,',',p11,',',p12,',',p13,',',p14,',',p15,',',p16,',',p17,',',p18,',',p19,',',p20,',',
p21,',',p22,',',p23,',',p24,',',p25,',',p26,',',p27,',',p28,',',p29,',',p30,',',p31,',',p32,',',p33,',',p34,',',p35,',',p36,',',p37,',',p38,',',p39,',',p40,',',
p41,',',p42,',',p43,',',p44,',',p45,',',p46,',',p47,',',p48,',',p49,',',p50,',',p51,',',p52,',',p53,',',p54,',',p55,',',p56,',',p57,',',p58,',',p59,',',p60,',',
p61,',',p62,',',p63,',',p64,',',p65,',',p66,',',p67,',',p68,',',p69,',',p70,',',p71,',',p72,',',p73,',',p74,',',p75,',',p76,',',p77,',',p78,',',p79,',',p80,',',
p81,',',p82,',',p83,',',p84,',',p85,',',p86,',',p87,',',p88,',',p89,',',p90,',',p91,',',p92,',',p93,',',p94,',',p95,',',p96,',',p97,',',p98,',',p99,',',p100,',') vector
from table1) a
) c

Use the JDBC driver included with Spark. I can compile the view now. I am using SQuirreL SQL.
Use SQuirreL SQL with the JDBC driver instead to connect and run queries. I was using the Microsoft Spark ODBC driver; the JDBC one works fine.

Related

Spark SQL persistent view over jdbc data source

I want to create a persistent (global) view in spark sql that gets data from an underlying jdbc database connection. It works fine when I use a temporary (session-scoped) view as shown below but fails when trying to create a regular (persistent and global) view.
I don't understand why the latter should not work but couldn't find any docs/hints as all examples are always done with temporary views. Technically, I cannot see why it shouldn't work as the data is properly retrieved from jdbc source in the temporary view and thus it should not matter if I wanted to "store" the query in a persistent view so that whenever calling the view it would retrieve data directly from jdbc source.
Config.
# Settings interpolated into the CREATE VIEW statements that follow.
# All values are plain strings; the table/view names must be quoted here,
# otherwise Python treats them as undefined identifiers (NameError).
tbl_in = 'myjdbctable'  # source table on the JDBC side (used as dbtable)
tbl_out = 'myview'  # name of the Spark SQL view to create
db_user = 'myuser'
db_pw = 'mypw'
jdbc_url = 'jdbc:sqlserver://myserver.domain:1433;database=mydb'
This works.
query = f"""
create or replace temporary view {tbl_out}
using jdbc
options(
dbtable '{tbl_in}',
user '{db_user}',
password '{db_pw}',
url '{jdbc_url}'
)
"""
spark.sql(query)
> DataFrame[]
This does not work.
query = f"""
create or replace view {tbl_out}
using jdbc
options(
dbtable '{tbl_in}',
user '{db_user}',
password '{db_pw}',
url '{jdbc_url}'
)
"""
spark.sql(query)
> ParseException:
Error.
ParseException:
mismatched input 'using' expecting {'(', 'UP_TO_DATE', 'AS', 'COMMENT', 'PARTITIONED', 'TBLPROPERTIES'}(line 3, pos 0)
== SQL ==
create or replace view myview
using jdbc
^^^
options(
dbtable 'myjdbctable',
user 'myuser',
password '[REDACTED]',
url 'jdbc:sqlserver://myserver.domain:1433;database=mydb'
)
TL;DR: A spark sql table over jdbc source behaves like a view and so can be used like one.
It seems my assumptions about jdbc tables in spark sql were flawed. It turns out that a sql table with a jdbc source (i.e. created via using jdbc) is actually a live query against the jdbc source (and not a one-off jdbc query during table creation as I assumed). In my mind it actually behaves like a view then. That means if the underlying jdbc source changes (e.g. new entries in a column) this is reflected in the spark sql table on read (e.g. select from) without having to re-create the table.
It follows that the spark sql table over jdbc source satisfies my requirement of having an always up-to-date reflection of the underlying table/sql object in the jdbc source. Usually, I would use a view for that. Maybe this is the reason why there is no persistent view over a jdbc source but only temporary views (which of course still make sense as they are session-scoped). It should be noted that the spark sql jdbc table behaves like a view, which may be surprising. In particular:
if you add a column in underlying jdbc table, it will not show up in spark sql table
if you remove a column from underlying jdbc table, an error will occur when spark sql table is accessed (assuming the removed column was present during spark sql table creation)
if you remove the underlying jdbc table, an error will occur when spark sql table is accessed
The input of spark.sql should be DML (Data Manipulation Language). Its output is a dataframe.
In terms of best practices, you should avoid using DDL (Data Definition Language) with spark.sql. Even if some statements may work, that's not meant to be used this way.
If you want to use DDL, simply connect to your DB using python packages.
If you want to create a temp view in Spark, do it using Spark's own syntax, createTempView.

Azure Data Factory Error: "incorrect syntax near"

I'm trying to do a simple incremental update from an on-prem database as source to Azure SQL database based on a varchar column called "RP" in On-Prem database that contains "date+staticdescription" for example: "20210314MetroFactory"
1- I've created a Lookup activity called Lookup1 using a table created in Azure SQL Database and uses this Query
"Select RP from SubsetwatermarkTable"
2- I've created a Copy data activity where the source settings have this Query
"Select * from SourceDevSubsetTable WHERE RP NOT IN '@{activity('Lookup1').output.value}'"
When debugging -- I'm getting the error:
Failure type: User configuration issue
Details: Failure happened on 'Source' side.
'Type=System.Data.SqlClient.SqlException,Message=Incorrect syntax near
'[{"RP":"20210307_1Plant
1KAO"},{"RP":"20210314MetroFactory"},{"RP":"20210312MetroFactory"},{"RP":"20210312MetroFactory"},{"RP":"2'.,Source=.Net
SqlClient Data
Provider,SqlErrorNumber=102,Class=15,ErrorCode=-2146232060,State=1,Errors=[{Class=15,Number=102,State=1,Message=Incorrect
syntax near
'[{"RP":"20210311MetroFactory"},{"RP":"20210311MetroFactory"},{"RP":"202103140MetroFactory"},{"RP":"20210308MetroFactory"},{"RP":"2'.,},],'
Can anyone tell me what I am doing wrong and how to fix it even if it requires creating more activities.
Note: There is no LastModifiedDate column in the table. Also I haven't yet created the StoredProcedure that will update the Lookup table when it is done with the incremental copy.
Steve is right as to why it is failing and about the query you need in the Copy Data activity.
As he says, you want a comma-separated list of quoted values to use in your IN clause.
You can get this more easily though - from your Lookup directly using this query:-
-- Returns one row with one column (in_clause): every rp value as a quoted,
-- comma-separated list ready to be spliced into an IN (...) predicate.
select stuff(
(
-- Double any embedded single quote so each value stays a valid SQL literal.
select ',''' + replace(rp, '''', '''''') + ''''
from subsetwatermarktable
-- NULL rp values contribute nothing to the list; filter them out explicitly.
where rp is not null
-- TYPE + .value() returns the text un-entitized; plain FOR XML PATH('')
-- would turn &, < and > into &amp;, &lt; and &gt;.
for xml path(''), type
).value('.', 'nvarchar(max)')
, 1, 1, ''  -- strip the spurious leading comma
) as in_clause
The sub-query gets the comma separated list with quotes around each rp-value, but has a spurious comma at the start - the outer query with stuff removes this.
Now tick the First Row Only box on the Lookup and change your Copy Data source query to:
-- Copy Data source query. in_clause is read from the first row of the
-- Lookup1 activity and is already a quoted, comma-separated list, so it is
-- interpolated inside the parentheses as-is.
select *
from SourceDevSubsetTable
where rp not in (@{activity('Lookup1').output.firstRow.in_clause})
The result of @activity('Lookup1').output.value is an array, as your error message shows:
[{"RP":"20210307_1Plant
1KAO"},{"RP":"20210314MetroFactory"},{"RP":"20210312MetroFactory"},{"RP":"20210312MetroFactory"},{"RP":"2'.,Source=.Net
SqlClient Data
Provider,SqlErrorNumber=102,Class=15,ErrorCode=-2146232060,State=1,Errors=[{Class=15,Number=102,State=1,Message=Incorrect
syntax near
'[{"RP":"20210311MetroFactory"},{"RP":"20210311MetroFactory"},{"RP":"202103140MetroFactory"},{"RP":"20210308MetroFactory"},{"RP":"2'.,},]
However, your SQL should look like this: Select * from SourceDevSubsetTable WHERE RP NOT IN ('20210307_1Plant 1KAO','20210314MetroFactory',...).
To achieve this in ADF, you need to do something like this:
1. Create three variables (apostrophe, arrayvalues and stringvalues), as in the following screenshot:
2. Loop over the result of @activity('Lookup1').output.value and append each quoted item().RP to arrayvalues:
loop items expression: @activity('Lookup1').output.value
append expression: @concat(variables('apostrophe'),item().RP,variables('apostrophe'))
3. Cast arrayvalues to a string and add parentheses with a Set variable activity:
expression: @concat('(',join(variables('arrayvalues'),','),')')
4. Copy to your Azure SQL database:
expression: Select * from SourceDevSubsetTable WHERE RP NOT IN @{variables('stringvalues')}

MemSQL join with union returns 1889 bad distributed join plan

I have events that are inserted to table game_events and other events that are inserted to table rejected_events.
I want to union them and then do bunch of joins with other data of other tables.
I run this query directly on the memSQL console:
SELECT combinedEvents.*, win.*
FROM (
SELECT event_id, action, status
FROM events.game_events
WHERE event_arrival_time BETWEEN '2019-12-17T00:00:00Z' AND '2019-12-30T23:59:59.999Z'
UNION
SELECT event_id, action,status
FROM rejected.events
WHERE event_arrival_time BETWEEN '2019-12-17T00:00:00Z' AND '2019-12-30T23:59:59.999Z'
) AS combinedEvents
LEFT JOIN winner.winner_data AS win ON combinedEvents.event_id = win.event_id AND win.status = 'ACCEPTED'
I get from mem:
Error Code: 1889. Bad distributed join plan: leaf select contains sharded tables of multiple databases. Please contact technical support.
If I remove the JOIN - It works properly.
Any ideas?
Thanks and best regards,
Ido
It looks like it got solved on the MemSQL Forums and the solution is to move the union to the right of the join.

Why does 'get_json_object' return different results when run in spark and sql tool

I have developed a hive query that uses lateral views and get_json_object to unpack some json. The query works well enough using a jdbc client (dbvisualizer) against a hive database but when run as spark sql from a java application, on the same data, it returns nothing.
I have tracked down the problem to differences in what the function 'get_json_object' returns.
The issue can be illustrated by this type of query
select concat_ws( "|", get_json_object('{"product_offer":[
{"productName":"Plan A"},
{"productName":"Plan B"}]}',
'$.product_offer.productName') )
When run in dbvisualizer against a Hive database I get an array of the 2 product names in the json array: ["Plan A","Plan B"].
When the same query is run as spark sql from a java application, null is returned.
I have noticed another difference: the path '$.product_offer[0].productName' returns 'Plan A' in db visualizer and nothing in spark.
The path to extract the array of product names is
-- Extracts every productName from the product_offer array; the explicit [*]
-- wildcard is the path form reported to work in both engines.
select concat_ws( "|", get_json_object('{"product_offer":[{"productName":"Plan A"},{"productName":"Plan B"}]}', '$.product_offer[*].productName') )
which works in both Spark and DbVisualizer.

Composite key in Cassandra with Pig

We have a CQL table that looks something like this:
CREATE table data (
occurday text,
seqnumber int,
occurtimems bigint,
unique bigint,
fields map<text, text>,
primary key ((occurday, seqnumber), occurtimems, unique)
)
I can query this table from cqlsh like this:
select * from data where seqnumber = 10 AND occurday = '2013-10-01';
This query works and returns the expected data.
If I execute this query as part of a LOAD from within Pig, however, things don't work.
-- Need to URL encode the query
data = LOAD 'cql://ks/data?where_clause=seqnumber%3D10%20AND%20occurday%3D%272013-10-01%27' USING CqlStorage();
gives
InvalidRequestException(why:seqnumber cannot be restricted by more than one relation if it includes an Equal)
at org.apache.cassandra.thrift.Cassandra$prepare_cql3_query_result.read(Cassandra.java:39567)
at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:78)
at org.apache.cassandra.thrift.Cassandra$Client.recv_prepare_cql3_query(Cassandra.java:1625)
at org.apache.cassandra.thrift.Cassandra$Client.prepare_cql3_query(Cassandra.java:1611)
at org.apache.cassandra.hadoop.cql3.CqlPagingRecordReader$RowIterator.prepareQuery(CqlPagingRecordReader.java:591)
at org.apache.cassandra.hadoop.cql3.CqlPagingRecordReader$RowIterator.executeQuery(CqlPagingRecordReader.java:621)
Shouldn't these behave the same? Why is the version through Pig failing where the straight cqlsh command works?
Hadoop is using CqlPagingRecordReader to try to load your data. This is leading to queries that are not identical to what you have entered. The paging record reader is trying to obtain small slices of Cassandra data at a time to avoid timeouts.
This means that your query is executed as
SELECT * FROM "data" WHERE token("occurday","seqnumber") > ? AND
token("occurday","seqnumber") <= ? AND occurday='A Great Day'
AND seqnumber=1 LIMIT 1000 ALLOW FILTERING
And this is why you are seeing your repeated key error. I'll submit a bug to the Cassandra Project.
Jira:
https://issues.apache.org/jira/browse/CASSANDRA-6151

Resources