Issue while running a Spark application on YARN

I have a test Spark environment (single node) running on AWS. I executed a few ad hoc queries in the PySpark shell and everything went as expected; however, when I run the application using spark-submit, I get an error.
Below is the code:
from __future__ import print_function
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext as sql
conf = SparkConf().setAppName("myapp")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
if __name__ == "__main__":
    # inp_data = loaded data from db
    df = inp_data.select('Id', 'DueDate', 'Principal', 'delay', 'unpaid_emi', 'future_payment')
    filterd_unpaid_emi = df.filter(df.unpaid_emi == 1)
    par = filterd_unpaid_emi.groupBy('Id').sum('Principal').withColumnRenamed('sum(Principal)', 'par')
    temp_df = df.filter(df.unpaid_emi == 1)
    temp_df_1 = temp_df.filter(temp_df.future_payment == 0)
    temp_df_1.registerTempTable("mytable")
    bucket_df_1 = sql("""select *, case
        when delay<0 and delay ==0 then '9999'
        when delay>0 and delay<7 then '9'
        when delay>=7 and delay<=14 then '8'
        when delay>=15 and delay<=29 then '7'
        when delay>=30 and delay<=59 then '6'
        when delay>=60 and delay<=89 then '5'
        when delay>=90 and delay<=119 then '4'
        when delay>=120 and delay<=149 then '3'
        when delay>=150 and delay<=179 then '2'
        else '1'
        end as bucket
        from mytable""")
    bucket_df_1 = bucket_df_1.select(bucket_df_1.Id, bucket_df_1.Principal, bucket_df_1.delay, bucket_df_1.unpaid_emi, bucket_df_1.future_payment, bucket_df_1.bucket.cast("int").alias('buckets'))
    min_bucket = bucket_df_1.groupBy('Id').min('buckets').withColumnRenamed('min(buckets)', 'max_delay')
    joinedDf = par.join(min_bucket, ["Id"])
    # joinedDf.printSchema()
And below is the command to submit the application:
spark-submit \
--master yarn \
--driver-class-path /path to/mysql-connector-java-5.0.8-bin.jar \
--jars /path to/mysql-connector-java-5.0.8-bin.jar \
/path to/mycode.py
ERROR:
17/11/10 10:00:34 INFO SparkSqlParser: Parsing command: mytable
Traceback (most recent call last):
File "/path to/mycode.py", line 36, in <module>
from mytable""")
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 73, in __init__
AttributeError: 'str' object has no attribute '_jsc'
17/11/10 10:00:34 INFO SparkContext: Invoking stop() from shutdown hook
17/11/10 10:00:34 INFO SparkUI: Stopped Spark web UI at ........
I'm quite new to Spark, so can someone please tell me the mistake(s) I'm making?
Also, any feedback on improving my coding style would be appreciated!
Spark version: 2.2

You are using the imported SQLContext class (aliased as sql), which is not bound to any Spark instance, to query your temp table, instead of spark.sql from the initialized SparkSession. I also changed some of your imports and code.
from __future__ import print_function
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
if __name__ == "__main__":
    # move the initializations within the main
    conf = SparkConf().setAppName("myapp")
    # create the session
    spark = SparkSession.builder.config(conf=conf) \
        .getOrCreate()
    # load your data and do what you need to do
    # inp_data = loaded data from db
    df = inp_data.select('Id', 'DueDate', 'Principal', 'delay', 'unpaid_emi', 'future_payment')
    filterd_unpaid_emi = df.filter(df.unpaid_emi == 1)
    par = filterd_unpaid_emi.groupBy('Id').sum('Principal').withColumnRenamed('sum(Principal)', 'par')
    temp_df = df.filter(df.unpaid_emi == 1)
    temp_df_1 = temp_df.filter(temp_df.future_payment == 0)
    temp_df_1.registerTempTable("mytable")
    # use spark.sql to query your table
    bucket_df_1 = spark.sql("""select *, case
        when delay<0 and delay ==0 then '9999'
        when delay>0 and delay<7 then '9'
        when delay>=7 and delay<=14 then '8'
        when delay>=15 and delay<=29 then '7'
        when delay>=30 and delay<=59 then '6'
        when delay>=60 and delay<=89 then '5'
        when delay>=90 and delay<=119 then '4'
        when delay>=120 and delay<=149 then '3'
        when delay>=150 and delay<=179 then '2'
        else '1'
        end as bucket
        from mytable""")
    bucket_df_1 = bucket_df_1.select(bucket_df_1.Id, bucket_df_1.Principal, bucket_df_1.delay, bucket_df_1.unpaid_emi, bucket_df_1.future_payment, bucket_df_1.bucket.cast("int").alias('buckets'))
    min_bucket = bucket_df_1.groupBy('Id').min('buckets').withColumnRenamed('min(buckets)', 'max_delay')
    joinedDf = par.join(min_bucket, ["Id"])
    # joinedDf.printSchema()
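As a side note, registerTempTable has been deprecated since Spark 2.0; on Spark 2.2 you may prefer its replacement, createOrReplaceTempView:
temp_df_1.createOrReplaceTempView("mytable")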
Hope this helps, good luck!

Related

Pyspark Job with Dataproc on GCP

I'm trying to run a PySpark job, but I keep getting a job failure for this reason:
Google Cloud Dataproc Agent reports job failure. If logs are available, they can be found at: https://console.cloud.google.com/dataproc/jobs/f8f8e95794e0457d80ea1b0c4df8d815?project=long-state-352923&region=us-central1 gcloud dataproc jobs wait 'f8f8e95794e0457d80ea1b0c4df8d815' --region 'us-central1' --project 'long-state-352923' ...
Here is also the code I'm running in the job:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName('spark_hdfs_to_hdfs') \
.getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("WARN")
MASTER_NODE_INSTANCE_NAME="cluster-d687-m"
log_files_rdd = sc.textFile('hdfs://{}/data/logs_example/*'.format(MASTER_NODE_INSTANCE_NAME))
splitted_rdd = log_files_rdd.map(lambda x: x.split(" "))
selected_col_rdd = splitted_rdd.map(lambda x: (x[0], x[3], x[5], x[6]))
columns = ["ip","date","method","url"]
logs_df = selected_col_rdd.toDF(columns)
logs_df.createOrReplaceTempView('logs_df')
sql = """
SELECT
url,
count(*) as count
FROM logs_df
WHERE url LIKE '%/article%'
GROUP BY url
"""
article_count_df = spark.sql(sql)
print(" ### Get only articles and blogs records ### ")
article_count_df.show(5)
I don't understand why it's failing.
Is there a problem with the code?

PySpark- Error accessing broadcast variable in udf while running in standalone cluster mode

import datetime
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType

@f.pandas_udf(returnType=DoubleType())
def square(r: pd.Series) -> pd.Series:
    print('In pandas Udf square')
    offset_value = offset.value
    return (r * r) + 10

if __name__ == "__main__":
    spark = SparkSession.builder.appName("Spark").getOrCreate()
    sc = spark.sparkContext
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    offset = sc.broadcast(10)
    x = pd.Series(range(0, 100))
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))
    df = df.withColumn('sq', square(df.x)).withColumn('sqsq', square(f.col('sq')))
    start_time = datetime.datetime.now()
    df.show()
    offset.unpersist()
    offset.destroy()
    spark.stop()
The above code works well if I run the submit command in local mode:
Submit.cmd --master local[*] test.py
With the same code, if I try to run in standalone cluster mode, i.e.
Submit.cmd --master spark://xx.xx.0.24:7077 test.py
I get an error while accessing the broadcast variable in the UDF:
java.io.IOException: Failed to delete original file 'C:\Users\xxx\AppData\Local\Temp\spark-bf6b4553-f30f-4e4a-a7f7-ef117329985c\executor-3922c28f-ed1e-4348-baa4-4ed08e042b76\spark-b59e518c-a20a-4a11-b96b-b7657b1c79ea\broadcast6537791588721535439' after copy to 'C:\Users\xxx\AppData\Local\Temp\spark-bf6b4553-f30f-4e4a-a7f7-ef117329985c\executor-3922c28f-ed1e-4348-baa4-4ed08e042b76\blockmgr-ee27f0f0-ee8b-41ea-86d6-8f923845391e\37\broadcast_0_python'
at org.apache.commons.io.FileUtils.moveFile(FileUtils.java:2835)
at org.apache.spark.storage.DiskStore.moveFileToBlock(DiskStore.scala:133)
at org.apache.spark.storage.BlockManager$TempFileBasedBlockStoreUpdater.saveToDiskStore(BlockManager.scala:424)
at org.apache.spark.storage.BlockManager$BlockStoreUpdater.$anonfun$save$1(BlockManager.scala:343)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1298)
Without accessing the broadcast variable in the UDF, this code works fine.

How to direct stream (Kafka) a JSON file in Spark and convert it into an RDD?

I wrote code that does a direct-stream (Kafka) word count when a file is given (in the producer).
Code:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
## Constants
APP_NAME = "PythonStreamingDirectKafkaWordCount"
##OTHER FUNCTIONS/CLASSES
def main():
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)
    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    main()
I need to convert the input JSON file to a Spark DataFrame using the DStream.
This should work:
Once you have the variable kvs containing the TransformedDStream, you can map out the message values and pass each resulting RDD to a handler function like this:
data = kvs.map( lambda tuple: tuple[1] )
data.foreachRDD( lambda yourRdd: readMyRddsFromKafkaStream( yourRdd ) )
You should define the handler function so that it creates the DataFrame from your JSON data:
def readMyRddsFromKafkaStream(readRdd):
    # Put the RDD into a DataFrame
    df = spark.read.json(readRdd)
    df.registerTempTable("temporary_table")
    df = spark.sql("""
        SELECT
            *
        FROM
            temporary_table
    """)
    df.show()
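Note that the handler above assumes a SparkSession named spark already exists on the driver. If it does not, a minimal sketch (assuming the handler runs on the driver, which is where foreachRDD callbacks execute) is to fetch or create one inside the function:
from pyspark.sql import SparkSession

def readMyRddsFromKafkaStream(readRdd):
    # Get (or reuse) the SparkSession on the driver
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.json(readRdd)
    df.createOrReplaceTempView("temporary_table")
    spark.sql("SELECT * FROM temporary_table").show()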
Hope it helps my friends :)

Creating a stream from a text file in Pyspark

I'm getting the following error when I try to create a stream from a text file in Pyspark:
TypeError: unbound method textFileStream() must be called with StreamingContext instance as first argument (got str instance instead)
I don't want to use SparkContext because I get another error, so to remove that error I have to use SparkSession.
My code:
import sys

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
    ssc = StreamingContext(spark.sparkContext, 5)
    input_path1 = sys.argv[1]
    input_path2 = sys.argv[2]
    ds1 = ssc.textFileStream(input_path1)
    lines1 = ds1.map(lambda x1: x1[1])
    windowedds1 = lines1.flatMap(lambda line1: line1.strip().split("\n")).map(lambda strelem1: float(strelem1)).window(5, 10)
    ds2 = ssc.textFileStream(input_path2)
    lines2 = ds2.map(lambda x2: x2[1])
    windowedds2 = lines2.flatMap(lambda line2: line2.strip().split("\n")).map(lambda strelem2: float(strelem2)).window(5, 10)
    result = Statistics.corr(windowedds1, windowedds2, method="pearson")
    if result > 0.7:
        print("ds1 and ds2 are correlated!!!")
    spark.stop()
Thank you!
You have to first create a StreamingContext object and then use it to call textFileStream.
spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 1)
ds = ssc.textFileStream(input_path)
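For completeness, here is a minimal sketch of how the corrected main block could look (assuming the input paths are still read from sys.argv, as in the question):
import sys
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
    # textFileStream is called on the StreamingContext instance, not the class
    ssc = StreamingContext(spark.sparkContext, 5)
    ds1 = ssc.textFileStream(sys.argv[1])
    ds2 = ssc.textFileStream(sys.argv[2])
    ds1.pprint()
    ds2.pprint()
    ssc.start()
    ssc.awaitTermination()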

Averaging of data using Apache Spark Streaming

I'm using Python.
I'm receiving JSON dictionaries through Kafka into a Spark stream.
The JSON is like {"a":10}{"a":20} (one dict means one Kafka message); the key will always be "a", but how many dictionaries there are is not certain.
Now I want the average of 10 and 20 in the above case.
As per my knowledge, averageByKey may be useful, but how to use it, I don't know.
Any help would be great!
Thank you for reading.
Update
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
def createContext():
    sc = SparkContext(appName="PythonSparkStreamingKafka_RM_02")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 60)
    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming-consumer', {'': 1})
    raw = kafkaStream.map(lambda kafkaS: kafkaS[1])
    clean = raw.map(lambda v: json.loads(v))
    print(dir(clean))
    clean.pprint()
    add = clean.map(lambda xs: ('Total', xs['hello'])).reduceByKey(lambda a, b: a + b)
    add.pprint()
    count_var = clean.count()
    count_var.pprint()
    average = add.map(lambda tpl: tpl[1] / float(60))
    average.pprint()
    return ssc

if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate('/path/checkpoint_v' + sys.argv[1], lambda: createContext())
    ssc.start()
    ssc.awaitTermination()
Now, in the above program I'm getting the add.pprint() output as below, for example.
The stream is like:
{u'hello': 26}
{u'hello': 28}
{u'hello': 31}
{u'hello': 35}
{u'hello': 40}
{u'hello': 46}
> ('Total', 206)
And the output of count_var.pprint() is as below, for example:
> 6
The question is, in the line below
> average = add.map(lambda tpl: tpl[1]/float(60))
I want to use the value from count_var (which is 6) instead of the static value 60.
So how can I use the stream object as an integer in the above operation?
First, you need to map your event to some processable type, for example a tuple. Then you can use the classic "map -> reduceByKey -> map" pattern to calculate the average, like this:
import json
ssc = StreamingContext(spark.sparkContext, 1)
dstream = KafkaUtils.createDirectStream(ssc, ['topic'], client_configuration,
                                        valueDecoder=lambda s: json.loads(s.decode('ascii')))

def map_event(raw):
    # (key, (count, value)) so that counts and sums can be reduced together
    item = list(raw[1].items())[0]
    return (item[0], (1, item[1]))

dstream.map(map_event).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
    .map(lambda r: (r[0], float(r[1][1]) / r[1][0])) \
    .pprint()
ssc.start()
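With the sample stream from the question ({u'hello': 26} through {u'hello': 46}), this should print something like ('hello', 34.33...), i.e. 206 / 6, without relying on a hard-coded batch count such as 60.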
