Perform INSERT INTO ... SELECT in AWS GLUE - apache-spark

The following script populates a target table with the data fetched from a source table using pyspark.sql and runs without problems in AWS Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
users = glueContext.create_dynamic_frame.from_catalog(
database="source", table_name="source_users"
)
users.toDF().createOrReplaceTempView("users")
query_users = """
SELECT U.id
, signup_from
FROM users AS U
"""
users_df = spark.sql(query_users)
users_dynamicframe = DynamicFrame.fromDF(
users_df.repartition(1), glueContext, "users_dynamicframe"
)
users_output = glueContext.write_dynamic_frame.from_catalog(
frame=users_dynamicframe,
database="target",
table_name="target_users",
transformation_ctx="users_output",
)
job.commit()
Now, I would like to perform an INSERT INTO SELECT ... ON DUPLICATE KEY UPDATE ...
and I wrote the following script:
source_users = glueContext.create_dynamic_frame.from_catalog(
database="source", table_name="source_users"
)
target_users = glueContext.create_dynamic_frame.from_catalog(
database = "target", table_name = "target_users"
)
source_users.toDF().createOrReplaceTempView("source_users")
target_users.toDF().createOrReplaceTempView("target_users")
query = """
INSERT INTO target_users
SELECT U.id
, U.user_type
FROM source_users
on duplicate key update id=target_users.id
"""
target_output = spark.sql(query)
job.commit()
which returns the following
ParseException: "\nmismatched input 'on' expecting <EOF>
I am not sure how to achieve this, and the reason why I am trying this is to reflect in the target table the updates happening in the source table.
Any help in this direction would be massively appreciated,
Thanks!

Related

Issue in Pyspark code when running Glue Script

import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# ####connect to database
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
url_list = URL.split("/")
host = "{}".format(url_list[-2][:-5])
new_host=host.split('#',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
Oracle_jdbc_url="jdbc:oracle:thin:#//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "xxgmdmadm.vendor_data").option("user", Oracle_Username).option("password", Oracle_Password).load()
#store the value in array
qrys = source_df.select("SRC_QUERY").collect()
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from (select max(request_id) over(partition by org_code ) max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where dnb_level =''LVL1'' and request_id in (131) ) where MAX_REQUEST_ID=request_id'
dnb_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("query", var).option("user", Oracle_Username).option("password", Oracle_Password).load()
error I am getting while populating dnb_df
An error occurred while calling o111.load. ORA-00911: invalid
character
Not getting what is wrong with this part
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
(select max(request_id) over(partition by org_code )
max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where
dnb_level =''LVL1'' and request_id in (131) ) where
MAX_REQUEST_ID=request_id'
Even when I am running simple query like getting same error
var="select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
XXGMDMADM.MDM_DNB_RESULTS_DTL"
Does query option doesn't work for Oracle ?
You can use alias while giving a query, works well:
.option("dbtable", "(SELECT * FROM schema.tablename) tbl")

Size in spark dataframe

I created a dataframe with a table of my postgres database. when i pass this command to see the number of row (df.count()), i have the error :
WARN TaskSetManager: Stage 9 contains a task of very large size (22439 KiB). The maximum recommended task size is 1000 KiB.
What does that mean ? what is the maximum size of a dataframe in spark ?
Here's the way that i connected to the postgre Database :
import configparser
import psycopg2
import pandas as pd
from queries import COUNTRY_TABLE,ACTORS_TABLE,COL_ACTOR, COL_COUNTRY
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
spark = SparkSession.builder.appName('ETL dvdrental pysaprk').getOrCreate()
def connection_db():
conn = psycopg2.connect("host=localhost dbname=demo user=postgres password=admin port=5432")
cur = conn.cursor()
return [cur, conn]
def extract_data(query):
conn_param = connection_db()
cur = conn_param[0]
conn = conn_param[1]
try:
cur.execute(query)
data = cur.fetchall()
return data
except Exception as e:
print(e)
tickets_col = ["ticket_no","book_ref", "passenger_id", "passenger_name","contact_data"]
tickets = spark.createDataFrame(extract_data("SELECT * FROM tickets")).toDF(*tickets_col)
tickets.count()
I have the warning when i execute tickets.count()

Pyspark S3A Access Denied Exception for cross account STS assume role

I setup an AWS Glue job to process S3 files present in another AWS account B. The IAM role in Account A(glue job IAM role) is using STS to assume a role in Account B which provides access to my desired files. Account B's IAM role have Trust relationship to the Glue job role in Account A. I was able to print access key and secret key, so assuming the STS is working well.
I get below error:
An error occurred while calling o83.json. com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied;
what is the right implementation of S3A connector as i get Access Denied Exception.
Here is my code:
from __future__ import print_function
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql import HiveContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import explode_outer
from pyspark.sql.functions import substring_index
from pyspark.sql.functions import input_file_name
from pyspark.sql import functions as f
import sys
import os
import os
import boto3
import sys
import errno
import time
import datetime
from datetime import timedelta, date
from pyspark.sql.functions import split
from pyspark.sql.functions import substring
from boto3.session import Session
spark = SparkSession\
.builder\
.appName("JsonInputFormat")\
.enableHiveSupport()\
.getOrCreate()\
sc = spark.sparkContext
hive_context = HiveContext(sc)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_context.setConf("hive.serialization.extend.nesting.levels","true")
sqlCtx = HiveContext(sc)
client = boto3.client('sts')
response = client.assume_role(RoleArn='ROLE_TO_ASSUME', RoleSessionName='AssumeRoleSession1')
credentials = response['Credentials']
ACCESS_KEY = credentials['AccessKeyId']
SECRET_KEY = credentials['SecretAccessKey']
print('access key is {}'.format(ACCESS_KEY))
print('secret key is {}'.format(SECRET_KEY))
print("Hadoop version: " + sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET_KEY)
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
spark._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
def flatten_schema(schema):
"""Take schema as returned from schema().jsonValue()
and return list of field names with full path"""
def _flatten(schema, path="", accum=None):
# Extract name of the current element
name = schema.get("name")
# If there is a name extend path
if name is not None:
path = "{0}.{1}".format(path, name) if path else name
print('path is {}'.format(path))
# It is some kind of struct
if isinstance(schema.get("fields"), list):
for field in schema.get("fields"):
_flatten(field, path, accum)
elif isinstance(schema.get("type"), dict):
_flatten(schema.get("type"), path, accum)
# It is an atomic type
else:
accum.append(path)
accum = []
_flatten(schema, "", accum)
return accum
sqlCtx.sql("set spark.sql.caseSensitive=true")
yesterday = date.today() - timedelta(1)
daybefore=yesterday.strftime("%Y-%m-%d")
currentdate=time.strftime("%Y-%m-%d")
key = 'KEY={}'.format(str(daybefore))
bucket = 'BUKCET_NAME'
df_base=spark.read.json('s3a://{}/{}/*/'.format(bucket,key))
base_schema=df_base.schema
datePrefix=str(daybefore)
source='s3a://{}/{}'.format(bucket,key)
df1=spark.read.json(source,schema=base_schema)
schema=df1.schema.jsonValue()
columns_list=flatten_schema(schema)
print('columns list is {}'.format(columns_list))
df2 = df1.select(*(col(x).alias(x.replace('.','_')) for x in columns_list))
print('df2 is {}'.format(df2))
df3=df2.select("*",explode_outer(df2.contents).alias("contents_flat"))
df3=df3.drop("contents")
print('df3 is {}'.format(df3))
schema4=df3.schema.jsonValue()
columns_list4=flatten_schema(schema4)
print('columns list 4 is {}'.format(columns_list4))
df5 = df3.select(*(col(x).alias(x.replace('.','_')) for x in columns_list4))
print('df5 is {}'.format(df5))
schema5=df5.schema.jsonValue()
columns_list5=flatten_schema(schema5)
print('columns list 5 is {}'.format(columns_list5))
df6 = df5.select(*(col(x).alias(x.replace('contents_flat','contents')) for x in columns_list5))
print('df6 is {}'.format(df6))
schema6=df6.schema.jsonValue()
columns_list6=flatten_schema(schema6)
print('column list 6 is {}'.format(columns_list6))
df7 = df6.select(*(col(x) for x in columns_list6)) #above line edited down here
schema7=df7.schema.jsonValue()
print('schema7 is {}'.format(schema7))
columns_list7=flatten_schema(schema7)
print('columns list 7 is {}'.format(columns_list7))
df7 = df7.select(*(col(x).alias(x.replace('.','_')) for x in columns_list7))
df7=df7.select("*",explode_outer(df7.business_address_latLng_warnings).alias("business_address_latLng_warnings_flat"))
df7=df7.drop("business_address_latLng_warnings")
print('df7 is {}'.format(df7))
df8 = df7.withColumn("filename",input_file_name())
split_col = split(df8['filename'], 'short_date=')
df9 = df8.withColumn('shortfilename', split_col.getItem(1))
df_final = df9.withColumn('filedate', substring('shortfilename',1,10)).drop('shortfilename')
print('df_final is {}'.format(df_final))
df_final.write.mode('append').csv('s3://{bucket}/{folder1}/', header='true')
spark.stop()```

spark 1.3 playing with hbase error

i'm trying to create hbase table and insert using spark core (spark streaming after).
I succeeded to create the table and add data into it, even when i got this problem:
warning: Class org.apache.hadoop.hbase.classification.InterfaceAudience not found - continuing with a stub.
but when i try to count i got an error; may someone help me with the first warning and how i cant add streaming data into this table
my code is bellow:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
val tableName = "ziedspark"
val conf = HBaseConfiguration.create()
conf.addResource(new Path("file:///opt/cloudera/parcels/CDH-5.4.7-1.cdh5.4.7.p0.3/etc/hbase/conf.dist/hbase-site.xml"))
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val admin = new HBaseAdmin(conf)
if(!admin.isTableAvailable(tableName)) {
print("Creating GHbase Table Creating GHbase Table Creating GHbase Table Creating GHbase Table ")
val tableDesc = new HTableDescriptor(tableName)
tableDesc.addFamily(new HColumnDescriptor("z1".getBytes()))
tableDesc.addFamily(new HColumnDescriptor("z2".getBytes()))
admin.createTable(tableDesc)
}else{
print("Table already exists!!")
}
val myTable = new HTable(conf, tableName)
for (i <- 414540 to 414545) {
var p = new Put(Bytes.toBytes(""+i))
p.add("z1".getBytes(), "name".getBytes(), Bytes.toBytes(""+(i*5)))
p.add("z1".getBytes(), "age".getBytes(), Bytes.toBytes("2016-07-01"))
p.add("z2".getBytes(), "job".getBytes(), Bytes.toBytes(""+i))
p.add("z2".getBytes(), "salary".getBytes(), Bytes.toBytes(""+i))
myTable.put(p)
}
myTable.flushCommits()
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
//error here after creating the table count is not working
val count = hBaseRDD.count()
print("HBase RDD count:" + count)
System.exit(0)
Please find a similar question related to the Reading from Spark.
How to read from hbase using spark
Also in the mentioned libraries you'll get the stub to read and write in HBase.
Let me know for any more help on the same.

Spark 2.0.0 truncate from Redshift table using jdbc

Hello I am using Spark SQL(2.0.0) with Redshift where I want to truncate my tables. I am using this spark-redshift package & I want to know how I can truncate my table.Can anyone please share example of this ??
I was unable to accomplish this using Spark and the code in the spark-redshift repo that you have listed above.
I was, however, able to use AWS Lambda with psycopg2 to truncate a redshift table. Then I use boto3 to kick off my spark job via AWS Glue.
The important code below is cur.execute("truncate table yourschema.yourtable")
from __future__ import print_function
import sys
import psycopg2
import boto3
def lambda_handler(event, context):
db_database = "your_redshift_db_name"
db_user = "your_user_name"
db_password = "your_password"
db_port = "5439"
db_host = "your_redshift.hostname.us-west-2.redshift.amazonaws.com"
try:
print("attempting to connect...")
conn = psycopg2.connect(dbname=db_database, user=db_user, password=db_password, host=db_host, port=db_port)
print("connected...")
conn.autocommit = True
cur = conn.cursor()
count_sql = "select count(pivotid) from yourschema.yourtable"
cur.execute(count_sql)
results = cur.fetchone()
print("countBefore: ", results[0])
countOfPivots = results[0]
if countOfPivots > 0:
cur.execute("truncate table yourschema.yourtable")
print("truncated yourschema.yourtable")
cur.execute(count_sql)
results = cur.fetchone()
print("countAfter: ", results[0])
cur.close()
conn.close()
glueClient = boto3.client("glue")
startTriiggerResponse = glueClient.start_trigger(Name="your-awsglue-ondemand-trigger")
print("startedTrigger:", startTriiggerResponse.Name)
return results
except Exception as e:
print(e)
raise e
You need to specify the mode to the library before calling save. For example:
my_dataframe.write
.format("com.databricks.spark.redshift")
.option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
.option("dbtable", "my_table")
.option("tempdir", "s3://my-bucket")
.option("diststyle", "KEY")
.option("distkey", "dist_key")
.option("sortkeyspec", "COMPOUND SORTKEY(key_1, key_2)")
.option("extracopyoptions", "TRUNCATECOLUMNS COMPUPDATE OFF STATUPDATE OFF")
.mode("overwrite") // "append" / "error"
.save()

Resources