Issue in Pyspark code when running Glue Script - apache-spark

import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# ####connect to database
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
url_list = URL.split("/")
host = "{}".format(url_list[-2][:-5])
new_host=host.split('#',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
Oracle_jdbc_url="jdbc:oracle:thin:#//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "xxgmdmadm.vendor_data").option("user", Oracle_Username).option("password", Oracle_Password).load()
#store the value in array
qrys = source_df.select("SRC_QUERY").collect()
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from (select max(request_id) over(partition by org_code ) max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where dnb_level =''LVL1'' and request_id in (131) ) where MAX_REQUEST_ID=request_id'
dnb_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("query", var).option("user", Oracle_Username).option("password", Oracle_Password).load()
error I am getting while populating dnb_df
An error occurred while calling o111.load. ORA-00911: invalid
character
Not getting what is wrong with this part
var='select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
(select max(request_id) over(partition by org_code )
max_request_id,a.* from XXGMDMADM.MDM_DNB_RESULTS_DTL a where
dnb_level =''LVL1'' and request_id in (131) ) where
MAX_REQUEST_ID=request_id'
Even when I am running simple query like getting same error
var="select DNB_RESULTS_DTL_ID,REQUEST_ID,source_id,dnb_level from
XXGMDMADM.MDM_DNB_RESULTS_DTL"
Does query option doesn't work for Oracle ?

You can use alias while giving a query, works well:
.option("dbtable", "(SELECT * FROM schema.tablename) tbl")

Related

Size in spark dataframe

I created a dataframe with a table of my postgres database. when i pass this command to see the number of row (df.count()), i have the error :
WARN TaskSetManager: Stage 9 contains a task of very large size (22439 KiB). The maximum recommended task size is 1000 KiB.
What does that mean ? what is the maximum size of a dataframe in spark ?
Here's the way that i connected to the postgre Database :
import configparser
import psycopg2
import pandas as pd
from queries import COUNTRY_TABLE,ACTORS_TABLE,COL_ACTOR, COL_COUNTRY
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
spark = SparkSession.builder.appName('ETL dvdrental pysaprk').getOrCreate()
def connection_db():
conn = psycopg2.connect("host=localhost dbname=demo user=postgres password=admin port=5432")
cur = conn.cursor()
return [cur, conn]
def extract_data(query):
conn_param = connection_db()
cur = conn_param[0]
conn = conn_param[1]
try:
cur.execute(query)
data = cur.fetchall()
return data
except Exception as e:
print(e)
tickets_col = ["ticket_no","book_ref", "passenger_id", "passenger_name","contact_data"]
tickets = spark.createDataFrame(extract_data("SELECT * FROM tickets")).toDF(*tickets_col)
tickets.count()
I have the warning when i execute tickets.count()

Perform INSERT INTO ... SELECT in AWS GLUE

The following script populates a target table with the data fetched from a source table using pyspark.sql and runs without problems in AWS Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
users = glueContext.create_dynamic_frame.from_catalog(
database="source", table_name="source_users"
)
users.toDF().createOrReplaceTempView("users")
query_users = """
SELECT U.id
, signup_from
FROM users AS U
"""
users_df = spark.sql(query_users)
users_dynamicframe = DynamicFrame.fromDF(
users_df.repartition(1), glueContext, "users_dynamicframe"
)
users_output = glueContext.write_dynamic_frame.from_catalog(
frame=users_dynamicframe,
database="target",
table_name="target_users",
transformation_ctx="users_output",
)
job.commit()
Now, I would like to perform an INSERT INTO SELECT ... ON DUPLICATE KEY UPDATE ...
and I wrote the following script:
source_users = glueContext.create_dynamic_frame.from_catalog(
database="source", table_name="source_users"
)
target_users = glueContext.create_dynamic_frame.from_catalog(
database = "target", table_name = "target_users"
)
source_users.toDF().createOrReplaceTempView("source_users")
target_users.toDF().createOrReplaceTempView("target_users")
query = """
INSERT INTO target_users
SELECT U.id
, U.user_type
FROM source_users
on duplicate key update id=target_users.id
"""
target_output = spark.sql(query)
job.commit()
which returns the following
ParseException: "\nmismatched input 'on' expecting <EOF>
I am not sure how to achieve this, and the reason why I am trying this is to reflect in the target table the updates happening in the source table.
Any help in this direction would be massively appreciated,
Thanks!

Pyspark S3A Access Denied Exception for cross account STS assume role

I setup an AWS Glue job to process S3 files present in another AWS account B. The IAM role in Account A(glue job IAM role) is using STS to assume a role in Account B which provides access to my desired files. Account B's IAM role have Trust relationship to the Glue job role in Account A. I was able to print access key and secret key, so assuming the STS is working well.
I get below error:
An error occurred while calling o83.json. com.amazon.ws.emr.hadoop.fs.shaded.com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied;
what is the right implementation of S3A connector as i get Access Denied Exception.
Here is my code:
from __future__ import print_function
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql import HiveContext
from pyspark.sql.functions import explode
from pyspark.sql.functions import explode_outer
from pyspark.sql.functions import substring_index
from pyspark.sql.functions import input_file_name
from pyspark.sql import functions as f
import sys
import os
import os
import boto3
import sys
import errno
import time
import datetime
from datetime import timedelta, date
from pyspark.sql.functions import split
from pyspark.sql.functions import substring
from boto3.session import Session
spark = SparkSession\
.builder\
.appName("JsonInputFormat")\
.enableHiveSupport()\
.getOrCreate()\
sc = spark.sparkContext
hive_context = HiveContext(sc)
hive_context.setConf("hive.exec.dynamic.partition", "true")
hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
hive_context.setConf("hive.serialization.extend.nesting.levels","true")
sqlCtx = HiveContext(sc)
client = boto3.client('sts')
response = client.assume_role(RoleArn='ROLE_TO_ASSUME', RoleSessionName='AssumeRoleSession1')
credentials = response['Credentials']
ACCESS_KEY = credentials['AccessKeyId']
SECRET_KEY = credentials['SecretAccessKey']
print('access key is {}'.format(ACCESS_KEY))
print('secret key is {}'.format(SECRET_KEY))
print("Hadoop version: " + sc._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion())
session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", ACCESS_KEY)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET_KEY)
spark._jsc.hadoopConfiguration().set("com.amazonaws.services.s3a.enableV4", "true")
spark._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
spark._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
def flatten_schema(schema):
"""Take schema as returned from schema().jsonValue()
and return list of field names with full path"""
def _flatten(schema, path="", accum=None):
# Extract name of the current element
name = schema.get("name")
# If there is a name extend path
if name is not None:
path = "{0}.{1}".format(path, name) if path else name
print('path is {}'.format(path))
# It is some kind of struct
if isinstance(schema.get("fields"), list):
for field in schema.get("fields"):
_flatten(field, path, accum)
elif isinstance(schema.get("type"), dict):
_flatten(schema.get("type"), path, accum)
# It is an atomic type
else:
accum.append(path)
accum = []
_flatten(schema, "", accum)
return accum
sqlCtx.sql("set spark.sql.caseSensitive=true")
yesterday = date.today() - timedelta(1)
daybefore=yesterday.strftime("%Y-%m-%d")
currentdate=time.strftime("%Y-%m-%d")
key = 'KEY={}'.format(str(daybefore))
bucket = 'BUKCET_NAME'
df_base=spark.read.json('s3a://{}/{}/*/'.format(bucket,key))
base_schema=df_base.schema
datePrefix=str(daybefore)
source='s3a://{}/{}'.format(bucket,key)
df1=spark.read.json(source,schema=base_schema)
schema=df1.schema.jsonValue()
columns_list=flatten_schema(schema)
print('columns list is {}'.format(columns_list))
df2 = df1.select(*(col(x).alias(x.replace('.','_')) for x in columns_list))
print('df2 is {}'.format(df2))
df3=df2.select("*",explode_outer(df2.contents).alias("contents_flat"))
df3=df3.drop("contents")
print('df3 is {}'.format(df3))
schema4=df3.schema.jsonValue()
columns_list4=flatten_schema(schema4)
print('columns list 4 is {}'.format(columns_list4))
df5 = df3.select(*(col(x).alias(x.replace('.','_')) for x in columns_list4))
print('df5 is {}'.format(df5))
schema5=df5.schema.jsonValue()
columns_list5=flatten_schema(schema5)
print('columns list 5 is {}'.format(columns_list5))
df6 = df5.select(*(col(x).alias(x.replace('contents_flat','contents')) for x in columns_list5))
print('df6 is {}'.format(df6))
schema6=df6.schema.jsonValue()
columns_list6=flatten_schema(schema6)
print('column list 6 is {}'.format(columns_list6))
df7 = df6.select(*(col(x) for x in columns_list6)) #above line edited down here
schema7=df7.schema.jsonValue()
print('schema7 is {}'.format(schema7))
columns_list7=flatten_schema(schema7)
print('columns list 7 is {}'.format(columns_list7))
df7 = df7.select(*(col(x).alias(x.replace('.','_')) for x in columns_list7))
df7=df7.select("*",explode_outer(df7.business_address_latLng_warnings).alias("business_address_latLng_warnings_flat"))
df7=df7.drop("business_address_latLng_warnings")
print('df7 is {}'.format(df7))
df8 = df7.withColumn("filename",input_file_name())
split_col = split(df8['filename'], 'short_date=')
df9 = df8.withColumn('shortfilename', split_col.getItem(1))
df_final = df9.withColumn('filedate', substring('shortfilename',1,10)).drop('shortfilename')
print('df_final is {}'.format(df_final))
df_final.write.mode('append').csv('s3://{bucket}/{folder1}/', header='true')
spark.stop()```

spark 1.3 playing with hbase error

i'm trying to create hbase table and insert using spark core (spark streaming after).
I succeeded to create the table and add data into it, even when i got this problem:
warning: Class org.apache.hadoop.hbase.classification.InterfaceAudience not found - continuing with a stub.
but when i try to count i got an error; may someone help me with the first warning and how i cant add streaming data into this table
my code is bellow:
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
val tableName = "ziedspark"
val conf = HBaseConfiguration.create()
conf.addResource(new Path("file:///opt/cloudera/parcels/CDH-5.4.7-1.cdh5.4.7.p0.3/etc/hbase/conf.dist/hbase-site.xml"))
conf.set(TableInputFormat.INPUT_TABLE, tableName)
val admin = new HBaseAdmin(conf)
if(!admin.isTableAvailable(tableName)) {
print("Creating GHbase Table Creating GHbase Table Creating GHbase Table Creating GHbase Table ")
val tableDesc = new HTableDescriptor(tableName)
tableDesc.addFamily(new HColumnDescriptor("z1".getBytes()))
tableDesc.addFamily(new HColumnDescriptor("z2".getBytes()))
admin.createTable(tableDesc)
}else{
print("Table already exists!!")
}
val myTable = new HTable(conf, tableName)
for (i <- 414540 to 414545) {
var p = new Put(Bytes.toBytes(""+i))
p.add("z1".getBytes(), "name".getBytes(), Bytes.toBytes(""+(i*5)))
p.add("z1".getBytes(), "age".getBytes(), Bytes.toBytes("2016-07-01"))
p.add("z2".getBytes(), "job".getBytes(), Bytes.toBytes(""+i))
p.add("z2".getBytes(), "salary".getBytes(), Bytes.toBytes(""+i))
myTable.put(p)
}
myTable.flushCommits()
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
classOf[org.apache.hadoop.hbase.client.Result])
//error here after creating the table count is not working
val count = hBaseRDD.count()
print("HBase RDD count:" + count)
System.exit(0)
Please find a similar question related to the Reading from Spark.
How to read from hbase using spark
Also in the mentioned libraries you'll get the stub to read and write in HBase.
Let me know for any more help on the same.

Zeppelin: muptiple SparkContexts issue

I tried to run the folloiwng simple code in Zeppelin:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
System.clearProperty("spark.driver.port")
System.clearProperty("spark.hostPort")
def maxWaitTimeMillis: Int = 20000
def actuallyWait: Boolean = false
val conf = new SparkConf().setMaster("local[2]").setAppName("Streaming test")
var sc = new SparkContext(conf)
def batchDuration: Duration = Seconds(1)
val ssc = new StreamingContext(sc, batchDuration)
This is the output in Zeppelin:
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
calculateRMSE: (output: org.apache.spark.streaming.dstream.DStream[(Double, Double)], n: org.apache.spark.streaming.dstream.DStream[Long])Double
res50: String = null
res51: String = null
maxWaitTimeMillis: Int
actuallyWait: Boolean
conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf#1daf4e42
org.apache.spark.SparkException: Only one SparkContext may be running in this JVM (see SPARK-2243). To ignore this error, set spark.driver.allowMultipleContexts = true. The currently running SparkContext was created at:
org.apache.spark.SparkContext.<init>(SparkContext.scala:82)
org.apache.zeppelin.spark.SparkInterpreter.createSparkContext(SparkInterpreter.java:356)
org.apache.zeppelin.spark.SparkInterpreter.getSparkContext(SparkInterpreter.java:150)
org.apache.zeppelin.spark.SparkInterpreter.open(SparkInterpreter.java:525)
org.apache.zeppelin.interpreter.ClassloaderInterpreter.open(ClassloaderInterpreter.java:74)
org.apache.zeppelin.interpreter.LazyOpenInterpreter.open(LazyOpenInterpreter.java:68)
org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:92)
org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:345)
org.apache.zeppelin.scheduler.Job.run(Job.java:176)
org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
java.util.concurrent.FutureTask.run(FutureTask.java:266)
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
at org.apache.spark.SparkContext$$anonfun$assertNoOtherContextIsRunning$1.apply(SparkContext.scala:2257)
at org.apache.spark.SparkContext$$anonfun$assertNoOtherContextIsRunning$1.apply(SparkContext.scala:2239)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.SparkContext$.assertNoOtherContextIsRunning(SparkContext.scala:2239)
at org.apache.spark.SparkContext$.markPartiallyConstructed(SparkContext.scala:2312)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:91)
Why does it say that I have multiple SparkContexts running? If I do not add the line var sc = new SparkContext(conf), then sc is null, so it's not created.
You can't use multiple SparkContexts in Zeppelin. It's one of his limitations since he's creating actually a webhook to a SparkContext.
If you wish to set up the your SparkConf in Zeppelin, the easiest way is to set those properties in the Interpreter menu and restart the interpreter to take those configuration in your SparkContext.
Now you can go back to your notebook and test your code :
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
def maxWaitTimeMillis: Int = 20000
def actuallyWait: Boolean = false
def batchDuration: Duration = Seconds(1)
val ssc = new StreamingContext(sc, batchDuration)
More on that here.

Resources