Only one SparkContext may be running in this JVM - Flask - apache-spark

Can anyone provide guidance on why this simple Flask app complains about Only one SparkContext may be running in this JVM. I'm not attempting to load more than one context, obviously.
Code:
import flask
from pyspark import SparkContext
from operator import itemgetter
app = flask.Flask(__name__)
@app.route('/')
def homepage():
    """Landing page: show an example URL for the delay endpoint.

    Returns:
        str: a usage hint for the /dt/<threshold> route.
    """
    # NOTE: the decorator was rendered as '#app.route' in the original paste;
    # without the '@' Flask never registers this route.
    return 'Example: /dt/140'
@app.route('/dt/<int:delaythreshold>')
def dt(delaythreshold):
    """Render the top-5 most frequent (day, delay) pairs at or above a threshold.

    Args:
        delaythreshold: minimum delay (from the URL) a flight must have to count.

    Returns:
        Rendered 'delays.html' with the five most common qualifying pairs.
    """
    global flights_rdd
    # fix: the original filtered on the undefined name `threshold`; the route
    # parameter is `delaythreshold`. Also rewritten without Python-2-only
    # tuple-parameter lambdas (removed by PEP 3113).
    flights_dict = (
        flights_rdd
        .filter(lambda day_delay: day_delay[1] >= delaythreshold)
        .countByValue()
    )
    # Sort by occurrence count, most frequent first.
    sorted_flight_tuples = sorted(
        flights_dict.items(), key=itemgetter(1), reverse=True
    )
    # fix: render_template was never imported; use the flask module imported above.
    return flask.render_template('delays.html', tuples=sorted_flight_tuples[:5])
if __name__ == '__main__':
    # Build the flights RDD once at startup; the view functions read it via
    # the module-global name. (A `global` statement at module level is a no-op.)
    sc = SparkContext()
    flights_rdd = (
        sc.textFile('/tmp/flights.csv', 4)
        .map(lambda s: s.split(','))
        # fix: the original used int(lst[1]) — `lst` is undefined (NameError);
        # the lambda parameter is `l`.
        .map(lambda l: (l[0][:4], int(l[1])))
        .cache()
    )
    app.config['DEBUG'] = True
    # fix: in debug mode the Werkzeug reloader re-executes the module, which
    # would try to create a SECOND SparkContext ("Only one SparkContext may
    # be running in this JVM"); keep debug but disable the reloader.
    app.run(host='0.0.0.0', use_reloader=False)
Thanks in advance.

You probably shouldn't create "global" resources such as the SparkContext in the __main__ section.
In particular, if you run your app in debug mode the module is instantly reloaded a second time upon start - hence the attempt to create a second SparkContext. (Add e.g. print 'creating sparkcontext' to your __main__ section before creating the SparkContext - you'll see it twice).
Check the Flask documentation for proposals on how to cache global resources.
Following http://flask.pocoo.org/docs/0.10/appcontext/#context-usage you could e.g. retrieve the SparkContext as follows:
from flask import g
def get_flights():
    """Return the flights RDD cached on the Flask application context.

    Creates the SparkContext and the RDD lazily on first use and stores both
    on `g`, so repeated requests reuse one SparkContext instead of trying to
    create a second one.

    Returns:
        The cached (day-prefix, delay) pair RDD.
    """
    flights_rdd = getattr(g, '_flights_rdd', None)
    if flights_rdd is None:
        # create flights_rdd on the fly
        sc = g._sc = SparkContext()
        flights_rdd = (
            sc.textFile('/tmp/flights.csv', 4)
            .map(lambda s: s.split(','))
            # fix: the original used int(lst[1]) — `lst` is undefined; the
            # lambda parameter is `l`.
            .map(lambda l: (l[0][:4], int(l[1])))
            .cache()
        )
        g._flights_rdd = flights_rdd
    return flights_rdd
@app.teardown_appcontext
def teardown_sparkcontext(exception):
    """Shut down the SparkContext stored on `g`, if one was created.

    Args:
        exception: the exception that ended the app context, or None.
    """
    sc = getattr(g, '_sc', None)
    if sc is not None:
        # fix: SparkContext exposes stop(), not close().
        sc.stop()
Then use flights_rdd = get_flights() instead of the global flights_rdd.

Related

AWS EMR Notebook running core nodes just once

I'm copying data from a MySQL db using spark:
def load_table(table):
    """Copy one MySQL table to S3 as Parquet via the Spark JDBC reader.

    Args:
        table: name of the database table to copy.
    """
    print(table)
    df = (
        spark.read
        .format("jdbc")
        .option("url", db_url)
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", table)
        .option("user", db_user)
        .option("password", db_password)
        .load()
    )
    # fix: the original formatted with `tb = table_name`, an undefined name;
    # the function parameter is `table`.
    # NOTE(review): db_name is presumably a module-level global — confirm.
    df.write.format("parquet").mode("overwrite").save(
        's3://MY-BUCKET-NAME/raw/DATABASE/{tb}/{db}/'.format(tb=table, db=db_name)
    )
This piece of code is running OK, but to make it faster I'm using threads:
from threading import Thread
from queue import Queue
# Work queue of table names: filled below, drained by the worker threads.
q = Queue()
# print(table_list)
# Number of concurrent copy workers to start.
worker_count = 1
def run_tasks(function, q):
    """Drain queue `q`, applying `function` to each item; exit when empty.

    Args:
        function: callable invoked once per dequeued item.
        q: queue.Queue of pending work items.
    """
    from queue import Empty
    # fix: the original tested q.empty() and then called q.get(), which races
    # with other workers (empty() can become stale between the two calls and
    # get() then blocks forever). get_nowait() makes the check-and-take atomic.
    while True:
        try:
            value = q.get_nowait()
        except Empty:
            break
        try:
            function(value)
        finally:
            # Always mark the item done so q.join() cannot hang.
            q.task_done()
# Enqueue every table to be copied.
for table in db_tables:
    q.put(table)
# fix: the original `t = Thread(...).start()` stored None (start() returns
# None) and created only ONE worker regardless of worker_count; restore the
# loop so worker_count daemon threads drain the queue concurrently.
for _ in range(worker_count):
    Thread(target=run_tasks, args=(load_table, q), daemon=True).start()
print('Running load')
# Block until every enqueued table has been processed (task_done per item).
q.join()
print('Load completed')
The problem is: it's running OK, but it only runs 4 tasks at a time (I have a Spark cluster with 1 master and 4 core nodes). Why are my core nodes not getting another job when one finishes?

Perform INSERT INTO ... SELECT in AWS GLUE

The following script populates a target table with the data fetched from a source table using pyspark.sql and runs without problems in AWS Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from awsglue.dynamicframe import DynamicFrame
## #params: [JOB_NAME]
# Resolve the job arguments supplied by the Glue runtime.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
# One SparkContext per Glue job; GlueContext wraps it and exposes a
# SparkSession for Spark SQL.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Initialise the Glue Job wrapper for this run.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Load the source table from the Glue Data Catalog as a DynamicFrame.
users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
# Register it as a temp view so it can be queried with Spark SQL.
users.toDF().createOrReplaceTempView("users")
query_users = """
SELECT U.id
, signup_from
FROM users AS U
"""
users_df = spark.sql(query_users)
# Convert back to a DynamicFrame; repartition(1) coalesces the result into a
# single partition before writing.
users_dynamicframe = DynamicFrame.fromDF(
    users_df.repartition(1), glueContext, "users_dynamicframe"
)
# Write into the catalog-defined target table; transformation_ctx names this
# sink (used by Glue's job-bookmark state).
users_output = glueContext.write_dynamic_frame.from_catalog(
    frame=users_dynamicframe,
    database="target",
    table_name="target_users",
    transformation_ctx="users_output",
)
job.commit()
Now, I would like to perform an INSERT INTO SELECT ... ON DUPLICATE KEY UPDATE ...
and I wrote the following script:
# Load both source and target tables from the Glue Data Catalog.
source_users = glueContext.create_dynamic_frame.from_catalog(
    database="source", table_name="source_users"
)
target_users = glueContext.create_dynamic_frame.from_catalog(
    database = "target", table_name = "target_users"
)
# Expose both as temp views so they can be referenced from Spark SQL.
source_users.toDF().createOrReplaceTempView("source_users")
target_users.toDF().createOrReplaceTempView("target_users")
# NOTE(review): 'ON DUPLICATE KEY UPDATE' is MySQL-specific syntax; Spark
# SQL's INSERT INTO statement does not accept it, which is exactly why the
# parser stops at 'on' with "mismatched input 'on' expecting <EOF>" (see the
# error quoted below). An upsert needs a different approach here, e.g. a
# join/merge in DataFrame code followed by an overwrite, or a JDBC sink that
# performs the upsert on the database side.
query = """
INSERT INTO target_users
SELECT U.id
, U.user_type
FROM source_users
on duplicate key update id=target_users.id
"""
target_output = spark.sql(query)
job.commit()
which returns the following
ParseException: "\nmismatched input 'on' expecting <EOF>
I am not sure how to achieve this, and the reason why I am trying this is to reflect in the target table the updates happening in the source table.
Any help in this direction would be massively appreciated,
Thanks!

Cannot read from BigQuery

I try to read a simple BigQuery table.
This hangs on:
WARNING:root:Dataset thijs-dev:temp_dataset_b234824381e04e1324234237724b485f95c does not exist so we will create it as temporary with location=EU
For this I use the following script:
python main.py \
--runner DirectRunner \
--project thijs-dev \
--temp_location gs://thijs/tmp/ \
--job_name thijs-dev-load \
--save_main_session
And the complete Python script:
import apache_beam as beam
import logging
import argparse
def run(argv=None):
    """Parse pipeline options and read the source BigQuery table.

    Args:
        argv: optional argument list; defaults to sys.argv when None.
    """
    arg_parser = argparse.ArgumentParser()
    known_args, pipeline_args = arg_parser.parse_known_args(argv)
    query = "select * from `thijs-dev.metathijs.thijs_locations`"
    with beam.Pipeline(argv=pipeline_args) as pipeline:
        # Read all data from source_table.
        rows = pipeline | beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True)
        )


if __name__ == '__main__':
    print("Start")
    logging.getLogger().setLevel(logging.INFO)
    run()
Turns out Dataflow is just extremely slow. It takes half an hour to process 26 MB of data, but it is working after all.

pyspark streaming from kinesis kills heap

Running a sample application streaming data from kinesis. I did not get why this application uses so much heap and crashes.
Here is the code :
from __future__ import print_function
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
from pyspark.sql.session import SparkSession
from datetime import datetime
# function declaration
def isDfEmpty(df):
    """Return True when `df` yields no rows.

    A failure while probing the DataFrame (e.g. df is None) also counts
    as empty, so callers never see an exception from this check.
    """
    try:
        first_rows = df.take(1)
    except Exception:
        return True
    return not first_rows
# function declaration
# function declaration
def mergeTable(df):
    """Union each incoming batch DataFrame into the module-global `refDf`.

    Side effects: rebinds the global `refDf`; prints timing and row-count
    diagnostics. Never raises — any failure is printed and swallowed.
    """
    print("b:mergeTable")
    print(str(datetime.now()))
    try:
        global refDf
        if isDfEmpty(df) :
            print("no record, waiting !")
        else :
            if(isDfEmpty(refDf)) :
                # First non-empty batch seeds the accumulator.
                refDf = df
            else :
                # NOTE(review): count() forces a full evaluation of the
                # ever-growing union lineage each batch — the question
                # reports that removing these two prints tames heap usage.
                print(" before count %s" % refDf.count())
                # unionAll keeps appending to the lineage; refDf grows
                # without bound across batches.
                refDf = df.unionAll(refDf)
                print(" after count %s" % refDf.count())
    except Exception as e:
        print(e)
    print(str(datetime.now()))
    print("e:mergeTable")
# function declaration
def doWork(df):
    """Timestamped wrapper around mergeTable; never lets an exception escape."""
    print("b:doWork")
    print(datetime.now())
    try:
        mergeTable(df)
    except Exception as exc:
        print(exc)
    print(datetime.now())
    print("e:doWork")
# function declaration
def sensorFilter(sensorType, rdd):
    """Keep only records mentioning `sensorType`, parse them as JSON, process.

    Relies on the module-global `spark` session created in __main__.
    """
    matching = rdd.filter(lambda record: sensorType in record)
    doWork(spark.read.json(matching))
def printRecord(rdd):
    """Announce a new micro-batch and run the SensorData pipeline on it."""
    separator = "========================================================"
    print(separator)
    print("Starting new RDD")
    print(separator)
    sensorFilter("SensorData", rdd)
# Accumulator DataFrame shared across batches; seeded/extended by mergeTable.
refDf = None
if __name__ == "__main__":
reload(sys)
# sys.setdefaultencoding('utf-8')
if len(sys.argv) != 5:
print( "Usage: dump.py <app-name> <stream-name> <endpoint-url> <region-name>", file=sys.stderr)
sys.exit(-1)
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
# sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
ssc = StreamingContext(sc, 10)
appName, streamName, endpointUrl, regionName = sys.argv[1:]
dstream = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 10)
dstream.foreachRDD(printRecord)
ssc.start()
ssc.awaitTermination()
After a time the spark application slowed down due to heap usage. But when i comment out the lines, heap usage decrease to normal levels.(According to SparkUI)
print(" before count %s" % refDf.count())
print(" after count %s" % refDf.count())
I am really new with pyspark and trying to get what is going on.
Merging on data frame continuously may explode the memory of course but the problem of heap occurs very beginning.
EDIT
Environment: tried on a single Ubuntu machine and on a CentOS VM hosted on macOS; nothing changed.

Spark Streaming - updateStateByKey and caching data

I have a problem with using updateStateByKey function and caching some big data at the same time. Here is a example.
Lets say I get data (lastname,age) from kafka. I want to keep actual age for every person so I use updateStateByKey. Also I want to know name of every person so I join output with external table (lastname,name) e.g. from Hive. Lets assume it's really big table, so I don't want to load it in every batch. And there's a problem.
All works well, when I load table in every batch, but when I try to cache table, StreamingContext doesn't start. I also tried to use registerTempTable and later join data with sql but i got the same error.
Seems like the problem is checkpoint needed by updateStateByKey. When I remove updateStateByKey and leave checkpoint i got error, but when I remove both it works.
Error I'm getting: pastebin
Here is code:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# function to keep actual state
# function to keep actual state
def updateFunc(channel, actualChannel):
    """updateStateByKey state function: keep the most recent value per key.

    Args:
        channel: list of new values for this key in the current batch
            (may be empty or None).
        actualChannel: the previously stored state (or None).

    Returns:
        The last new value when the batch brought any; otherwise the
        previous state unchanged.
    """
    # fix: the original reached the same result through a convoluted
    # condition, a silent broad except, and a dead `channel = actualChannel`
    # assignment (the local was never read again). A truthiness check on the
    # batch list expresses the intent directly.
    if channel:
        return channel[-1]
    return actualChannel
def splitFunc(row):
    """Parse one whitespace-separated 'lastname age' line into a 2-tuple.

    Raises ValueError when the line does not contain exactly two fields.
    """
    fields = row.strip().split()
    lname, age = fields
    return lname, age
def createContext(brokers,topics):
    """Build the SparkContext/HiveContext/StreamingContext, wire the Kafka
    stream through updateStateByKey + a join against `big_df`, and return
    the StreamingContext (invoked by StreamingContext.getOrCreate when no
    checkpoint exists).
    """
    # some conf
    # NOTE: appName is a module-level global assigned in __main__.
    conf = SparkConf().setAppName(appName).set("spark.streaming.stopGracefullyOnShutdown","true").set("spark.dynamicAllocation.enabled","false").\
        set("spark.serializer","org.apache.spark.serializer.KryoSerializer").set("spark.sql.shuffle.partitions",'100')
    # create SparkContext
    sc = SparkContext(conf=conf)
    # create HiveContext
    sqlContext = HiveContext(sc)
    # create Streaming Context
    ssc = StreamingContext(sc, 5)
    # read big_df and cache (not work, Streaming Context not start)
    # NOTE(review): caching here is the reported failure case — see the
    # accepted workaround below that lazily builds big_df inside the batch.
    big_df = sqlContext.sql('select lastname,name from `default`.`names`')
    big_df.cache().show(10)
    # join table
    def joinTable(time,rdd):
        # Runs per micro-batch; closes over big_df created above.
        if rdd.isEmpty()==False:
            df = HiveContext.getOrCreate(SparkContext.getOrCreate()).createDataFrame(rdd,['lname','age'])
            # read big_df (work)
            #big_df = HiveContext.getOrCreate(SparkContext.getOrCreate()).sql('select lastname,name from `default`.`names`')
            # join DMS
            df2 = df.join(big_df,df.lname == big_df.lastname,"left_outer")
            return df2.map(lambda row:row)
    # streaming
    kvs = KafkaUtils.createDirectStream(ssc, [topics], {'metadata.broker.list': brokers})
    # NOTE(review): `lambda (k,v): ...` uses Python-2-only tuple-parameter
    # unpacking (removed in Python 3 by PEP 3113); this code targets
    # Spark 1.6 / Python 2.
    kvs.map(lambda (k,v): splitFunc(v)).updateStateByKey(updateFunc).transform(joinTable).pprint()
    return ssc
if __name__ == "__main__":
appName="SparkCheckpointUpdateSate"
if len(sys.argv) != 3:
print("Usage: SparkCheckpointUpdateSate.py <broker_list> <topic>")
exit(-1)
brokers, topics = sys.argv[1:]
# getOrCreate Context
checkpoint = 'SparkCheckpoint/checkpoint'
ssc = StreamingContext.getOrCreate(checkpoint,lambda: createContext(brokers,topics))
# start streaming
ssc.start()
ssc.awaitTermination()
Can you tell me how to properly cache data when checkpoint is enabled? Maybe there is some workaround I don't know.
Spark ver. 1.6
I get this working using lazily instantiated global instance of big_df. Something like that is done in recoverable_network_wordcount.py
.
def getBigDf():
    """Lazily build the Hive lookup DataFrame once, memoized in globals().

    Subsequent calls return the cached 'bigdf' entry instead of re-running
    the Hive query.
    """
    if 'bigdf' not in globals():
        hive_ctx = HiveContext.getOrCreate(SparkContext.getOrCreate())
        globals()['bigdf'] = hive_ctx.sql('select lastname,name from `default`.`names`')
    return globals()['bigdf']
# Sketch of the reworked createContext (elided parts marked with ...).
def createContext(brokers,topics):
    ...
    def joinTable(time,rdd):
        ...
        # read big_df (work)
        # NOTE(review): the helper above is defined as getBigDf (lowercase
        # final 'f') but invoked here as getBigDF — Python names are
        # case-sensitive, so one of the two spellings must change.
        big_df = getBigDF()
        # join DMS
        df2 = df.join(big_df,df.lname == big_df.lastname,"left_outer")
        return df2.map(lambda row:row)
    ...
Seems like in streaming all data must be cached inside streaming processing, not before.

Resources