Spark 2.0.0 truncate from Redshift table using jdbc - apache-spark

Hello, I am using Spark SQL (2.0.0) with Redshift and I want to truncate my tables. I am using the spark-redshift package, and I want to know how I can truncate my table. Can anyone please share an example of this?

I was unable to accomplish this using Spark and the code in the spark-redshift repo that you have listed above.
I was, however, able to use AWS Lambda with psycopg2 to truncate a Redshift table. Then I use boto3 to kick off my Spark job via AWS Glue.
The important line below is cur.execute("truncate table yourschema.yourtable").
from __future__ import print_function
import sys
import psycopg2
import boto3

def lambda_handler(event, context):
    db_database = "your_redshift_db_name"
    db_user = "your_user_name"
    db_password = "your_password"
    db_port = "5439"
    db_host = "your_redshift.hostname.us-west-2.redshift.amazonaws.com"
    try:
        print("attempting to connect...")
        conn = psycopg2.connect(dbname=db_database, user=db_user, password=db_password, host=db_host, port=db_port)
        print("connected...")
        conn.autocommit = True
        cur = conn.cursor()
        count_sql = "select count(pivotid) from yourschema.yourtable"
        cur.execute(count_sql)
        results = cur.fetchone()
        print("countBefore: ", results[0])
        countOfPivots = results[0]
        if countOfPivots > 0:
            cur.execute("truncate table yourschema.yourtable")
            print("truncated yourschema.yourtable")
            cur.execute(count_sql)
            results = cur.fetchone()
            print("countAfter: ", results[0])
        cur.close()
        conn.close()
        glueClient = boto3.client("glue")
        startTriggerResponse = glueClient.start_trigger(Name="your-awsglue-ondemand-trigger")
        print("startedTrigger:", startTriggerResponse["Name"])
        return results
    except Exception as e:
        print(e)
        raise e
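If you want to call this truncate Lambda from another Python process (for example, right before a Spark load), a minimal boto3 sketch looks like the following; the function name truncate-redshift-table is a hypothetical placeholder for whatever you deploy the handler above as.
import json
import boto3

# Invoke the Lambda defined above; "truncate-redshift-table" is a hypothetical
# deployment name, not something defined in this answer.
lambda_client = boto3.client("lambda")
response = lambda_client.invoke(
    FunctionName="truncate-redshift-table",
    InvocationType="RequestResponse",  # block until the truncate completes
    Payload=json.dumps({}),
)
print("status:", response["StatusCode"])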

You need to specify the save mode to the library before calling save. For example:
my_dataframe.write
.format("com.databricks.spark.redshift")
.option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
.option("dbtable", "my_table")
.option("tempdir", "s3://my-bucket")
.option("diststyle", "KEY")
.option("distkey", "dist_key")
.option("sortkeyspec", "COMPOUND SORTKEY(key_1, key_2)")
.option("extracopyoptions", "TRUNCATECOLUMNS COMPUPDATE OFF STATUPDATE OFF")
.mode("overwrite") // "append" / "error"
.save()
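The snippet above is Scala-style. A rough PySpark equivalent of the same overwrite would look like the sketch below; my_dataframe and the connection values are placeholders carried over from the example, and as far as I know overwrite mode (which replaces the table contents) is the closest this library gets to a truncate.
# A rough PySpark version of the same write; my_dataframe and the connection
# values are placeholders carried over from the example above.
(my_dataframe.write
    .format("com.databricks.spark.redshift")
    .option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
    .option("dbtable", "my_table")
    .option("tempdir", "s3://my-bucket")
    .mode("overwrite")  # or "append" / "error"
    .save())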

Related

Size in spark dataframe

I created a DataFrame from a table in my Postgres database. When I run the command to count the rows (df.count()), I get this warning:
WARN TaskSetManager: Stage 9 contains a task of very large size (22439 KiB). The maximum recommended task size is 1000 KiB.
What does that mean? What is the maximum size of a DataFrame in Spark?
Here is how I connected to the Postgres database:
import configparser
import psycopg2
import pandas as pd
from queries import COUNTRY_TABLE, ACTORS_TABLE, COL_ACTOR, COL_COUNTRY
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf

spark = SparkSession.builder.appName('ETL dvdrental pyspark').getOrCreate()

def connection_db():
    conn = psycopg2.connect("host=localhost dbname=demo user=postgres password=admin port=5432")
    cur = conn.cursor()
    return [cur, conn]

def extract_data(query):
    conn_param = connection_db()
    cur = conn_param[0]
    conn = conn_param[1]
    try:
        cur.execute(query)
        data = cur.fetchall()
        return data
    except Exception as e:
        print(e)

tickets_col = ["ticket_no", "book_ref", "passenger_id", "passenger_name", "contact_data"]
tickets = spark.createDataFrame(extract_data("SELECT * FROM tickets")).toDF(*tickets_col)
tickets.count()
I get this warning when I execute tickets.count().

how to convert spark sql query result to dataframe python

How do I get a spark.sql query result into a DataFrame? When I run the line below it gives me an object; is there any way to read the spark.sql result as a DataFrame?
I tried the code below, but it gives an object:
df = spark_session.sql()
The steps below show how to get data from an RDBMS using Spark SQL and store it in a DataFrame. This example reflects good practice for writing Spark code for production scripts.
'''
DataFrame creation scripts
#author: Mr. Ravi Kumar
'''
def get_session():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('basic1').getOrCreate()
    sc = spark.sparkContext
    return sc, spark

# MySQL connection details
driver = "com.mysql.jdbc.Driver"
url = "jdbc:mysql://127.0.0.1:3306/test"
user = "root"
pwd = "India#123"

# Building the connection and reading data from MySQL
def read_data(spark, sc):
    sourceDf = spark.read.format("jdbc").option("driver", driver)\
        .option("url", url)\
        .option("dbtable", "employee")\
        .option("user", user)\
        .option("password", pwd)\
        .load()
    print("Built MySQL connection successfully!")
    return sourceDf

# Validating the data
def data_disp(spark, sc):
    df = read_data(spark, sc)
    print("***************************Data Preview*******************************************")
    df.show(truncate=False)

# 2nd highest employee, job-wise
def secondHighest(spark, sc):
    import pyspark.sql.window as W
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    sourceDf = read_data(spark, sc)
    # window spec
    v = W.Window.partitionBy(sourceDf["empid"]).orderBy(sourceDf["salary"].desc())
    highest = sourceDf.withColumn("2nd_Highest", F.dense_rank().over(v))
    return highest

# Writing back after processing
def write_mysql(spark, sc):
    output = secondHighest(spark, sc)
    output.write.format("jdbc").option("driver", driver)\
        .option("url", url)\
        .option("dbtable", "Second_highest")\
        .option("user", user)\
        .option("password", pwd)\
        .save()

# main function
if __name__ == '__main__':
    sc, spark = get_session()
    read_data(spark, sc)
    data_disp(spark, sc)
    secondHighest(spark, sc)
    write_mysql(spark, sc)
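One practical prerequisite for the JDBC reads and writes above: the MySQL driver must be on Spark's classpath. A minimal sketch of pulling it in when building the session is shown below; the connector version is an assumed example, so match it to your server.
from pyspark.sql import SparkSession

# Fetch the MySQL JDBC driver at startup via Maven coordinates
# (8.0.28 is just an example version; use the one matching your MySQL server).
spark = (SparkSession.builder
    .appName('basic1')
    .config("spark.jars.packages", "mysql:mysql-connector-java:8.0.28")
    .getOrCreate())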

How to load data from a connection string with vaex package?

If I have a table on my server and a connection string to it, how can I load it into a dataframe using Vaex?
Here is what I currently do with Pandas:
from sqlalchemy import types, create_engine, text
import pandas as pd
import pymysql
def connect_to_data(driver='mysql+pymysql://', conn_string=''):
    try:
        conn = create_engine(driver + conn_string)
        print("MySQL Connection Successful!")
    except Exception as err:
        print("MySQL Connection Failed!")
        print(err)
    return conn

# Connect to the db:
conn_string = 'xxxxxxxx'
conn = connect_to_data(conn_string=conn_string)

# Get all requests from the db:
query = '''SELECT * FROM table_name'''
result = conn.execute(text(query))

# Desired dataframe:
df = pd.read_sql_query(query, conn)
How can I do the same with Vaex (because of its high performance)?
For now at least, you can't do it directly. But vaex can easily read a pandas DataFrame, so you can do:
# Following your example..
import vaex

pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
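If the table is too large to fit comfortably in a single pandas DataFrame, one possible workaround (a sketch, reusing query and conn from the snippet above) is to read it in chunks and let vaex concatenate them:
import pandas as pd
import vaex

# Read the SQL result in chunks so only one chunk lives in pandas at a time,
# then let vaex concatenate the converted pieces.
chunks = [vaex.from_pandas(chunk)
          for chunk in pd.read_sql_query(query, conn, chunksize=100_000)]
df = vaex.concat(chunks)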

Perform INSERT INTO ... SELECT in AWS GLUE

The following script populates a target table with the data fetched from a source table using pyspark.sql and runs without problems in AWS Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
users.toDF().createOrReplaceTempView("users")

query_users = """
SELECT U.id
     , signup_from
FROM users AS U
"""
users_df = spark.sql(query_users)
users_dynamicframe = DynamicFrame.fromDF(
    users_df.repartition(1), glueContext, "users_dynamicframe"
)
users_output = glueContext.write_dynamic_frame.from_catalog(
    frame=users_dynamicframe,
    database="target",
    table_name="target_users",
    transformation_ctx="users_output",
)
job.commit()
Now, I would like to perform an INSERT INTO SELECT ... ON DUPLICATE KEY UPDATE ..., so I wrote the following script:
source_users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
target_users = glueContext.create_dynamic_frame.from_catalog(
    database="target", table_name="target_users"
)
source_users.toDF().createOrReplaceTempView("source_users")
target_users.toDF().createOrReplaceTempView("target_users")

query = """
INSERT INTO target_users
SELECT U.id
     , U.user_type
FROM source_users
on duplicate key update id=target_users.id
"""
target_output = spark.sql(query)
job.commit()
which returns the following:
ParseException: "\nmismatched input 'on' expecting <EOF>
I am not sure how to achieve this. The reason I am trying it is to reflect in the target table the updates that happen in the source table.
Any help in this direction would be massively appreciated.
Thanks!

Getting "Error while trying to retrieve text for error ORA-01804" when executing an AWS Python Lambda on Linux

I am trying to execute the Lambda function below on AWS Lambda, using Python 3.7 as the runtime environment.
import cx_Oracle
import os
import logging
import boto3
from botocore.exceptions import ClientError
from base64 import b64decode

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def lambda_handler(event, context):
    logger.info('begin lambda_handler')
    os.environ['LD_LIBRARY_PATH'] = os.getcwd()
    dsn = cx_Oracle.makedsn("hostname", 1521, service_name="servicename")
    con = cx_Oracle.connect("userid", "password", dsn)
    cur = con.cursor()
    #logger.info('username: ' + username)
    #logger.info('host: ' + host)
    sql = """SELECT COUNT(*) AS TEST_COUNT FROM DUAL"""
    cur.execute(sql)
    columns = [i[0] for i in cur.description]
    rows = [dict(zip(columns, row)) for row in cur]
    logger.info(rows)
    con.close()
    logger.info('end lambda_handler')
    return "Successfully connected to oracle."
But when I execute the Lambda above, I get the error below.
Error while trying to retrieve text for error ORA-01804
Any help on this?
Check whether your Oracle Instant Client version matches your database's version; a mismatch can also lead to this error.
I tried using the latest Oracle Instant Client v21.1 and it spewed the same error.
It turned out the server hosting the database was running v11.2, so I had to download the v11.2 client to match it.
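A quick way to compare the two versions from inside the same Lambda is a small cx_Oracle check like the one below (a sketch reusing the placeholder connection details from the question):
import cx_Oracle

# Compare the Instant Client version bundled with the Lambda against the database version.
print("client:", cx_Oracle.clientversion())   # e.g. (21, 1, 0, 0, 0)
dsn = cx_Oracle.makedsn("hostname", 1521, service_name="servicename")
con = cx_Oracle.connect("userid", "password", dsn)
print("database:", con.version)               # e.g. "11.2.0.4.0"
con.close()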
