AWS EMR Notebook running core nodes just once - apache-spark

I'm copying data from a MySQL db using spark:
def load_table(table):
    print(table)
    df = (
        spark.read
        .format("jdbc")
        .option("url", db_url)
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", table)
        .option("user", db_user)
        .option("password", db_password)
        .load()
    )
    df.write.format("parquet").mode("overwrite").save(
        's3://MY-BUCKET-NAME/raw/DATABASE/{tb}/{db}/'.format(tb=table, db=db_name)
    )
This piece of code runs fine, but to make it faster I'm using threads:
from threading import Thread
from queue import Queue

q = Queue()
# print(table_list)
worker_count = 1

def run_tasks(function, q):
    while not q.empty():
        value = q.get()
        function(value)
        q.task_done()

for table in db_tables:
    q.put(table)

# threads = []
# for i in range(worker_count):
#     t = Thread(target=run_tasks, args=(load_table, q))
#     t.daemon = True
#     t.start()

t = Thread(target=run_tasks, args=(load_table, q), daemon=True).start()
print('Running load')
q.join()
print('Load completed')
The problem is: it runs OK, but only 4 times (I have a Spark cluster with 1 master and 4 core nodes). Why aren't my core nodes picking up another job when they finish?
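For comparison, here is a minimal sketch of the same queue/worker pattern started with more than one thread (load_table and db_tables are assumed to be defined as above; whether the work then spreads evenly across all core nodes still depends on your cluster and session configuration):

from threading import Thread
from queue import Queue

# Minimal sketch (assumption: one worker thread per core node).
# load_table and db_tables are the names used in the question above.
worker_count = 4

q = Queue()
for table in db_tables:
    q.put(table)

def run_tasks(function, q):
    while not q.empty():
        value = q.get()
        function(value)
        q.task_done()

for _ in range(worker_count):
    Thread(target=run_tasks, args=(load_table, q), daemon=True).start()

print('Running load')
q.join()
print('Load completed')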

Related

Memory leak in very simple spark streaming app (reading file stream and write to delta)?

I have a very simple spark streaming app that reads parquet data from s3 and upserts it to a delta table:
import boto3
import os
from pathlib import Path
from delta import *
from delta.tables import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
app_name = "my_app"
spark = SparkSession.builder.appName(app_name).config("spark.databricks.io.cache.enabled", "true").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
env = os.environ["ENVIRONMENT"]
landing_bucket = f"s3a://awesommmme-data-landing-zone-{env}" # data_real/landing
cleansing_bucket = f"s3a://awesommmme-data-cleansing-zone-{env}"
checkpoint_bucket = f"s3a://awesommmme-data-spark-checkpoints-{env}"
landing_data_label_path = os.path.join(landing_bucket, "contract_label", "data")
cleansing_data_label_path = os.path.join(cleansing_bucket, "contract_label", "data")
_label_logic_schema = [
    StructField("token_address", StringType()),
    StructField("token_type", StringType()),
]
_raw_data_schema = _label_logic_schema + [
    StructField("update_dt", TimestampType()),
    StructField("year", StringType()),
    StructField("month", StringType()),
    StructField("day", StringType()),
    StructField("hour", StringType()),
    StructField("minute", StringType()),
]
raw_data_schema = StructType(_raw_data_schema)
deltaTable = None

def upsert_to_cleaned_delta(pdf, batchId):
    global deltaTable
    if deltaTable is None:
        try:
            deltaTable = DeltaTable.forPath(spark, cleansing_data_label_path)
        except Exception as e:
            import logging
            logging.error(str(e))
            pdf.write.format("delta").partitionBy("year", "month", "day", "hour", "minute").save(
                cleansing_data_label_path
            )
            deltaTable = DeltaTable.forPath(spark, cleansing_data_label_path)
            return
    deltaTable.alias("old_data").merge(
        pdf.alias("new_data"), "old_data.token_address = new_data.token_address"
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

def main():
    _ = (
        spark.readStream.schema(raw_data_schema)
        .parquet(landing_data_label_path)
        .writeStream.format("delta")
        .option("maxFilesPerTrigger", 10)
        .option("checkpointLocation", "MY_CHECKPOINT_PATH")
        .foreachBatch(lambda pdf, batch_id: upsert_to_cleaned_delta(pdf, batch_id))
        .trigger(processingTime="1 second")
        .outputMode("append")
        .start()
    )
    spark.streams.awaitAnyTermination()

if __name__ == "__main__":
    main()
I run this app on a Spark on Kubernetes cluster, but when I monitor resources there is a memory leak (monitoring graphs of container and pod memory usage omitted). As time goes on, memory reaches the limit and the pods die.
Here is the values.yaml used to deploy the Spark app with Helm:
app: data-pipeline
applicationId: myapp
name: myapp-data
namespace: "{{ .Release.Namespace }}"
environment: dev
sparkConf:
  spark.executor.heartbeatInterval: "600s"
  spark.network.timeout: "3600s"
deps:
  jars:
    - delta-core_2.12-2.0.1.jar
restartPolicy:
  type: Always
driver:
  annotations: {}
  coreRequest: 100m
  coreLimit: 500m
  memory: 3g
executor:
  coreRequest: 100m
  coreLimit: 500m
  instances: 1
  memory: 2g
A few things to note:
The files the Spark app reads are really small (KBs) and created very seldom (one file every 2~3 hours).
The delta table is about 190 MB, roughly 700,000 rows.
I have no idea why a memory leak occurs...
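Not a diagnosis, but one common source of slow driver-side memory growth in long-running streaming jobs is retained job/stage/query metadata for the Spark UI. Below is a minimal sketch of capping it when building the session; the config keys are standard Spark settings, but assuming they are relevant to this particular leak is only a guess:

# Sketch only: cap how much job/stage/query metadata the driver retains.
# These are standard Spark settings; whether they matter for this leak
# is an assumption, not a diagnosis.
spark = (
    SparkSession.builder.appName(app_name)
    .config("spark.databricks.io.cache.enabled", "true")
    .config("spark.ui.retainedJobs", "100")
    .config("spark.ui.retainedStages", "100")
    .config("spark.sql.ui.retainedExecutions", "50")
    .getOrCreate()
)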

how to convert spark sql query result to dataframe python

How do I get a spark.sql query result into a dataframe? When I run the line below it gives me an object; is there any way to read the spark.sql result as a dataframe?
I tried the code below, but it gives an object:
df = spark_session.sql()
The steps below show how to get data from an RDBMS using Spark SQL and store it in a dataframe. This example follows good practice for writing Spark code in production scripts.
'''
DataFrame creation scripts
#author: Mr. Ravi Kumar
'''

def get_session():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('basic1').getOrCreate()
    sc = spark.sparkContext
    return sc, spark

# mysql connection details
driver = "com.mysql.jdbc.Driver"
url = "jdbc:mysql://127.0.0.1:3306/test"
user = "root"
pwd = "India#123"

# Building connection and reading data from mysql
def read_data(spark, sc):
    sourceDf = spark.read.format("jdbc").option("driver", driver)\
        .option("url", url)\
        .option("dbtable", "employee")\
        .option("user", user)\
        .option("password", pwd)\
        .load()
    print("Built mysql connection successfully!")
    return sourceDf

# validating the data
def data_disp(spark, sc):
    df = read_data(spark, sc)
    print("***************************Data Preview*******************************************")
    df.show(truncate=0)

# 2nd highest employee, job-wise
def secondHighest(spark, sc):
    import pyspark.sql.window as W
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    sourceDf = read_data(spark, sc)
    # window spec
    v = W.Window.partitionBy(sourceDf["empid"]).orderBy(sourceDf["salary"].desc())
    highest = sourceDf.withColumn("2nd_Highest", F.dense_rank().over(v))
    return highest

# writing back after processing
def write_mysql(spark, sc):
    output = secondHighest(spark, sc)
    output.write.format("jdbc").option("driver", driver)\
        .option("url", url)\
        .option("dbtable", "Second_highest")\
        .option("user", user)\
        .option("password", pwd)\
        .save()

# main function
if __name__ == '__main__':
    sc, spark = get_session()
    read_data(spark, sc)
    data_disp(spark, sc)
    secondHighest(spark, sc)
    write_mysql(spark, sc)
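As a brief follow-up on the original question: SparkSession.sql() already returns a DataFrame, so the "object" you see is that DataFrame and can be used directly. A minimal sketch, reusing the read_data helper above and registering the data as an employee temp view (the view name is just an assumption for illustration):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('basic1').getOrCreate()

# Expose the JDBC source as a temp view so it can be queried with SQL.
# read_data is the helper defined above.
read_data(spark, spark.sparkContext).createOrReplaceTempView("employee")

# spark.sql() returns a DataFrame; the "object" printed is that DataFrame.
df = spark.sql("select * from employee")
print(type(df))   # <class 'pyspark.sql.dataframe.DataFrame'>
df.show()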

Spark 2.0.0 truncate from Redshift table using jdbc

Hello, I am using Spark SQL (2.0.0) with Redshift and I want to truncate my tables. I am using the spark-redshift package, and I want to know how I can truncate my table. Can anyone please share an example of this?
I was unable to accomplish this using Spark and the code in the spark-redshift repo that you have listed above.
I was, however, able to use AWS Lambda with psycopg2 to truncate a redshift table. Then I use boto3 to kick off my spark job via AWS Glue.
The important code below is cur.execute("truncate table yourschema.yourtable")
from __future__ import print_function
import sys
import psycopg2
import boto3

def lambda_handler(event, context):
    db_database = "your_redshift_db_name"
    db_user = "your_user_name"
    db_password = "your_password"
    db_port = "5439"
    db_host = "your_redshift.hostname.us-west-2.redshift.amazonaws.com"
    try:
        print("attempting to connect...")
        conn = psycopg2.connect(dbname=db_database, user=db_user, password=db_password, host=db_host, port=db_port)
        print("connected...")
        conn.autocommit = True
        cur = conn.cursor()
        count_sql = "select count(pivotid) from yourschema.yourtable"
        cur.execute(count_sql)
        results = cur.fetchone()
        print("countBefore: ", results[0])
        countOfPivots = results[0]
        if countOfPivots > 0:
            cur.execute("truncate table yourschema.yourtable")
            print("truncated yourschema.yourtable")
            cur.execute(count_sql)
            results = cur.fetchone()
            print("countAfter: ", results[0])
        cur.close()
        conn.close()
        glueClient = boto3.client("glue")
        startTriggerResponse = glueClient.start_trigger(Name="your-awsglue-ondemand-trigger")
        print("startedTrigger:", startTriggerResponse["Name"])
        return results
    except Exception as e:
        print(e)
        raise e
You need to specify the mode to the library before calling save. For example:
my_dataframe.write
  .format("com.databricks.spark.redshift")
  .option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
  .option("dbtable", "my_table")
  .option("tempdir", "s3://my-bucket")
  .option("diststyle", "KEY")
  .option("distkey", "dist_key")
  .option("sortkeyspec", "COMPOUND SORTKEY(key_1, key_2)")
  .option("extracopyoptions", "TRUNCATECOLUMNS COMPUPDATE OFF STATUPDATE OFF")
  .mode("overwrite") // "append" / "error"
  .save()
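If the spark-redshift connector version in use supports the preactions option (an assumption worth checking against the connector's documentation), a truncate can also be issued from Spark itself right before the write. A minimal PySpark sketch:

# Sketch only: run a TRUNCATE as a pre-action before appending, assuming the
# spark-redshift connector in use supports the "preactions" option.
(my_dataframe.write
    .format("com.databricks.spark.redshift")
    .option("url", "jdbc:redshift://my_cluster.qwertyuiop.eu-west-1.redshift.amazonaws.com:5439/my_database?user=my_user&password=my_password")
    .option("dbtable", "my_table")
    .option("tempdir", "s3://my-bucket")
    .option("preactions", "truncate table my_table")
    .mode("append")
    .save())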

Spark Streaming - updateStateByKey and caching data

I have a problem with using the updateStateByKey function and caching some big data at the same time. Here is an example.
Let's say I get data (lastname, age) from Kafka. I want to keep the current age for every person, so I use updateStateByKey. I also want to know each person's name, so I join the output with an external table (lastname, name), e.g. from Hive. Let's assume it's a really big table, so I don't want to load it in every batch. And there's the problem.
Everything works well when I load the table in every batch, but when I try to cache the table, the StreamingContext doesn't start. I also tried to use registerTempTable and later join the data with SQL, but I got the same error.
It seems the problem is the checkpoint needed by updateStateByKey. When I remove updateStateByKey and leave the checkpoint, I get the error; when I remove both, it works.
The error I'm getting: pastebin
Here is the code:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# function to keep actual state
def updateFunc(channel, actualChannel):
    if (actualChannel is None or not channel is None):
        try:
            actualChannel = channel[-1]
        except Exception:
            pass
    if channel is None:
        channel = actualChannel
    return actualChannel

def splitFunc(row):
    row = row.strip()
    lname, age = row.split()
    return (lname, age)

def createContext(brokers, topics):
    # some conf
    conf = SparkConf().setAppName(appName).set("spark.streaming.stopGracefullyOnShutdown","true").set("spark.dynamicAllocation.enabled","false").\
        set("spark.serializer","org.apache.spark.serializer.KryoSerializer").set("spark.sql.shuffle.partitions",'100')
    # create SparkContext
    sc = SparkContext(conf=conf)
    # create HiveContext
    sqlContext = HiveContext(sc)
    # create Streaming Context
    ssc = StreamingContext(sc, 5)
    # read big_df and cache (not work, Streaming Context not start)
    big_df = sqlContext.sql('select lastname,name from `default`.`names`')
    big_df.cache().show(10)

    # join table
    def joinTable(time, rdd):
        if rdd.isEmpty() == False:
            df = HiveContext.getOrCreate(SparkContext.getOrCreate()).createDataFrame(rdd, ['lname', 'age'])
            # read big_df (work)
            #big_df = HiveContext.getOrCreate(SparkContext.getOrCreate()).sql('select lastname,name from `default`.`names`')
            # join DMS
            df2 = df.join(big_df, df.lname == big_df.lastname, "left_outer")
            return df2.map(lambda row: row)

    # streaming
    kvs = KafkaUtils.createDirectStream(ssc, [topics], {'metadata.broker.list': brokers})
    kvs.map(lambda (k, v): splitFunc(v)).updateStateByKey(updateFunc).transform(joinTable).pprint()
    return ssc
if __name__ == "__main__":
    appName = "SparkCheckpointUpdateSate"
    if len(sys.argv) != 3:
        print("Usage: SparkCheckpointUpdateSate.py <broker_list> <topic>")
        exit(-1)
    brokers, topics = sys.argv[1:]
    # getOrCreate Context
    checkpoint = 'SparkCheckpoint/checkpoint'
    ssc = StreamingContext.getOrCreate(checkpoint, lambda: createContext(brokers, topics))
    # start streaming
    ssc.start()
    ssc.awaitTermination()
Can you tell me how to properly cache data when checkpointing is enabled? Maybe there is some workaround I don't know about.
Spark version: 1.6
I got this working using a lazily instantiated global instance of big_df. Something similar is done in recoverable_network_wordcount.py.
def getBigDf():
    if 'bigdf' not in globals():
        globals()['bigdf'] = HiveContext.getOrCreate(SparkContext.getOrCreate()).sql('select lastname,name from `default`.`names`')
    return globals()['bigdf']

def createContext(brokers, topics):
    ...
    def joinTable(time, rdd):
        ...
        # read big_df (work)
        big_df = getBigDf()
        # join DMS
        df2 = df.join(big_df, df.lname == big_df.lastname, "left_outer")
        return df2.map(lambda row: row)
    ...
It seems that in streaming, all data must be cached inside the streaming processing, not before it.

Only one SparkContext may be running in this JVM - Flask

Can anyone provide guidance on why this simple Flask app complains that "Only one SparkContext may be running in this JVM"? I'm not attempting to load more than one context, obviously.
Code:
import flask
from flask import render_template
from pyspark import SparkContext
from operator import itemgetter

app = flask.Flask(__name__)

@app.route('/')
def homepage():
    return 'Example: /dt/140'

@app.route('/dt/<int:delaythreshold>')
def dt(delaythreshold):
    global flights_rdd
    flights_dict = \
        flights_rdd \
        .filter( lambda (day, delay): delay >= delaythreshold ) \
        .countByValue()
    sorted_flight_tuples = \
        sorted( flights_dict.items(), key=itemgetter(1), reverse=True )
    return render_template('delays.html', tuples=sorted_flight_tuples[:5])

if __name__ == '__main__':
    global flights_rdd
    sc = SparkContext()
    flights_rdd = \
        sc.textFile('/tmp/flights.csv', 4) \
          .map( lambda s: s.split(',') ) \
          .map( lambda l: ( l[0][:4], int(l[1]) ) ) \
          .cache()
    app.config['DEBUG'] = True
    app.run(host='0.0.0.0')
Thanks in advance.
You probably shouldn't create "global" resources such as the SparkContext in the __main__ section.
In particular, if you run your app in debug mode the module is instantly reloaded a second time upon start - hence the attempt to create a second SparkContext. (Add e.g. print 'creating sparkcontext' to your __main__ section before creating the SparkContext - you'll see it twice).
Check the Flask documentation for proposals on how to cache global resources.
Following http://flask.pocoo.org/docs/0.10/appcontext/#context-usage you could e.g. retrieve the SparkContext as follows:
from flask import g

def get_flights():
    flights_rdd = getattr(g, '_flights_rdd', None)
    if flights_rdd is None:
        # create flights_rdd on the fly
        sc = g._sc = SparkContext()
        flights_rdd = \
            sc.textFile('/tmp/flights.csv', 4) \
              .map( lambda s: s.split(',') ) \
              .map( lambda l: ( l[0][:4], int(l[1]) ) ) \
              .cache()
        g._flights_rdd = flights_rdd
    return flights_rdd

@app.teardown_appcontext
def teardown_sparkcontext(exception):
    sc = getattr(g, '_sc', None)
    if sc is not None:
        sc.stop()
Then use flights_rdd = get_flights() instead of the global flights_rdd.
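For illustration, a minimal sketch of the question's /dt route rewritten to use get_flights() instead of the module-level global (keeping the same Python 2-era syntax as the rest of the snippet):

# Sketch only: the question's route handler, using get_flights() from above
# instead of a module-level flights_rdd.
@app.route('/dt/<int:delaythreshold>')
def dt(delaythreshold):
    flights_dict = \
        get_flights() \
        .filter( lambda (day, delay): delay >= delaythreshold ) \
        .countByValue()
    sorted_flight_tuples = \
        sorted( flights_dict.items(), key=itemgetter(1), reverse=True )
    return render_template('delays.html', tuples=sorted_flight_tuples[:5])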
