pyspark udf memory crash - apache-spark

I have a function for object detection that uses OpenCV in GPU mode:
def object_detection(image_as_bytes):
try:
.
.
.
net = cv.dnn.readNetFromDarknet("/home/hp/PycharmProjects/pyspark/models/yolov3.cfg",
"/home/user/PycharmProjects/pyspark/models/yolov3-tiny.weights")
.
.
.
return image_metadata
except:
return None
and I call it from this function:
def image_proc(s: pd.Series) -> pd.Series:
ls = list(s.to_list())
threads = []
for i in range(len(ls)):
threads.append(pyspark.InheritableThread(target=object_detection, args=(ls[i],)))
threads[i].start()
for i in threads:
i.join()
.
.
.
return pd.Series(result)
My PySpark code reads data from a Kafka stream and runs those threads for every mini-batch:
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "2") \
.load()
im_proc_udf = pandas_udf(im_proc, returnType=StringType())
df = df.withColumn('object_detection_data', im_proc_udf(col('value')))
When I monitored the process, I think that for every batch Spark loads and serializes my YOLO weights for all threads, which fills up my memory until it crashes.
My question is: can I load those files into memory once, or pass them in from somewhere else, so that they stop filling up my memory?

Related

Why the performance of Redis is worse than Hive?

I'm using Hadoop to work on a big data project.
I can use spark to send some SQL command to Hive.
Since this process is slow, I try to write my data into Redis which is an open-source database and use spark to query my data from this database to speed up this process.
I have deployed redis server in my virtual machine, and I can use spark session to read, write and run sql command on redis by using spark-redis module.
https://github.com/RedisLabs/spark-redis
Here's my testing script. I use spark session to get table from hive and write into redis.
from pyspark.sql import SparkSession
import time
import pandas as pd
spark = SparkSession.builder \
.appName("read_and_write") \
.config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
.enableHiveSupport() \
.getOrCreate()
# read table from hive
sparkDF = spark.sql("SELECT * FROM hive_table")
sparkDF.show()
# write table into redis
sparkDF.write.format("org.apache.spark.sql.redis") \
.option("table", "redis_table") \
.mode("overwrite") \
.save()
After the writing process finished, I wrote two scripts to compare the speed of Redis and Hive.
This script is to test hive:
from pyspark.sql import SparkSession
import time, json
spark = SparkSession.builder \
.appName("hive_spark_test") \
.config("hive.metastore.uris", "thrift://localhost:9083") \
.config("spark.debug.maxToStringFields", "500") \
.config("spark.sql.execution.arrow.enabled", True) \
.config("spark.sql.shuffle.partitions", 20) \
.config("spark.default.parallelism", 20) \
.config("spark.storage.memoryFraction", 0.5) \
.config("spark.shuffle.memoryFraction", 0.3) \
.config("spark.shuffle.consolidateFiles", False) \
.config("spark.shuffle.sort.bypassMergeThreshold", 200) \
.config("spark.shuffle.file.buffer", "32K") \
.config("spark.reducer.maxSizeInFlight", "48M") \
.enableHiveSupport() \
.getOrCreate()
for i in range(20):
# you can use your own sql command
sql_command = "SELECT testColumn1, SUM(testColumn2) AS testColumn2 FROM hive_table WHERE (date BETWEEN '2022-01-01' AND '2022-03-10') GROUP BY GROUPING SETS ((testColumn1))"
readDF = spark.sql(sql_command)
df_json = readDF.toJSON()
df_collect = df_json.collect()
res = [json.loads(i) for i in df_collect]
print(res)
Here's the result. The duration is 0.2s to 0.5s after a few rounds.
enter image description here
This script is to test redis:
from pyspark.sql import SparkSession
import time, json
spark = SparkSession.builder \
.appName("redis_spark_test") \
.config("spark.redis.host", "localhost") \
.config("spark.redis.port", "6379") \
.config("spark.redis.max.pipeline.size", 200) \
.config("spark.redis.scan.count", 200) \
.config("spark.debug.maxToStringFields", "500") \
.config("spark.sql.execution.arrow.enabled", True) \
.config("spark.sql.shuffle.partitions", 20) \
.config("spark.default.parallelism", 20) \
.config("spark.storage.memoryFraction", 0.5) \
.config("spark.shuffle.memoryFraction", 0.3) \
.config("spark.shuffle.consolidateFiles", False) \
.config("spark.shuffle.sort.bypassMergeThreshold", 200) \
.config("spark.shuffle.file.buffer", "32K") \
.config("spark.reducer.maxSizeInFlight", "48M") \
.getOrCreate()
sql_command = """CREATE OR REPLACE TEMPORARY VIEW redis_table (
testColumn1 STRING,
testColumn2 INT,
testColumn3 STRING,
testColumn4 STRING,
date DATE,)
USING org.apache.spark.sql.redis OPTIONS (table 'redis_table')
"""
spark.sql(sql_command)
for i in range(20):
# you can use your own sql command
sql_command = "SELECT testColumn1, SUM(testColumn2) AS testColumn2 FROM redis_table WHERE (date BETWEEN '2022-01-01' AND '2022-03-10') GROUP BY GROUPING SETS ((testColumn1))"
readDF = spark.sql(sql_command)
df_json = readDF.toJSON()
df_collect = df_json.collect()
res = [json.loads(i) for i in df_collect]
print(res)
Here's the result. The duration is 1s to 2s after a few rounds.
enter image description here
This result conflicts with what I found in my research: Redis should be faster than Hive, but I get the opposite result.
I want to know the reason, and how to make Redis run faster than Hive through Spark, if that's possible.
Thank you.

Queries with streaming sources must be executed with writeStream.start(); pyspark

I have trouble reading messages from Kafka: the following exception appears: "Queries with streaming sources must be executed with writeStream.start();"
Here is my code:
from dataclasses import dataclass
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
# Restored decorator: without it the class has no generated __init__, so the
# positional construction DeviceData(x[0], x[1], x[2], x[3]) used below fails.
@dataclass
class DeviceData:
    """One device reading built from the four comma-separated message fields."""

    device: str
    temp: float
    humd: float
    pres: float
spark:SparkSession = SparkSession.builder \
.master("local[1]") \
.appName("StreamHandler") \
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
inputDF = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "weather") \
.load()
rawDF = inputDF.selectExpr("CAST(value AS STRING)")
df_split = inputDF.select(f.split(inputDF.value, ",")) \
.rdd.Map(lambda x: DeviceData(x[0], x[1], x[2], x[3])) \
.toDF(schema=['device', 'temp', 'humd', 'pres'])
summaryDF = df_split.groupBy('device') \
.agg(f.avg('temp'), f.avg('humd'), f.avg('pres'))
query = summaryDF.writeStream.format('console').outputMode('update').start()
query.awaitTermination()

Error while passing dataframe to UDF in Structured Streaming

I am reading events from Kafka in Spark Structured streaming and need to process events one by one and write to redis. I wrote a UDF for that but it gives me spark context error.
conf = SparkConf()\
.setAppName(spark_app_name)\
.setMaster(spark_master_url)\
.set("spark.redis.host", "redis")\
.set("spark.redis.port", "6379")\
.set("spark.redis.auth", "abc")
spark = SparkSession.builder\
.config(conf=conf)\
.getOrCreate()
def func(element, event, timestamp):
#redis i/o
pass
schema = ArrayType(StructType(
[
StructField("element_id", StringType()),
StructField("event_name", StringType()),
StructField("event_time", StringType())
]
))
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "kafka:9092") \
.option("subscribe", topic) \
.load()
#.option("includeTimestamp", value = True)\
ds = df.selectExpr(("CAST(value AS STRING)"))\
.withColumn("value", explode(from_json("value", schema)))
filter_func = udf(func, ArrayType(StringType()))
ds = ds.withColumn("column_name", filter_func(
ds['value']['element_id'],
ds['value']['event_name'],
ds['value']['event_time']
))
query = ds.writeStream \
.format("console") \
.start()
query.awaitTermination()
Error message: _pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
Any help is appreciated.
I was trying to access spark context from within user defined function which is not allowed.
Within the udf, I was trying to write to spark-redis by using spark context.

How to transform dataframes to rdds in structured streaming?

I get data from Kafka using PySpark streaming, and the result is a DataFrame. When I transform the DataFrame to an RDD, it goes wrong:
Traceback (most recent call last):
File "/home/docs/dp_model/dp_algo_platform/dp_algo_core/test/test.py", line 36, in <module>
df = df.rdd.map(lambda x: x.value.split(" ")).toDF()
File "/home/softs/spark-2.4.3-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 91, in rdd
File "/home/softs/spark-2.4.3-bin-hadoop2.6/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/home/softs/spark-2.4.3-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
pyspark.sql.utils.AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nkafka'
the right version code:
spark = SparkSession \
.builder \
.appName("StructuredNetworkWordCount") \
.getOrCreate()
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "test") \
.load()
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
df = df.withColumn("s", F.split(df['value'], " "))
df = df.withColumn('e', F.explode(df['s']))
# df = df.rdd.map(lambda x: x.value.split(" ")).toDF()
q = df.writeStream \
.format("console") \
.trigger(processingTime='30 seconds') \
.start()
q.awaitTermination()
this is the wrong version code:
spark = SparkSession \
.builder \
.appName("StructuredNetworkWordCount") \
.getOrCreate()
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "test") \
.load()
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
# df = df.withColumn("s", F.split(df['value'], " "))
# df = df.withColumn('e', F.explode(df['s']))
df = df.rdd.map(lambda x: x.value.split(" ")).toDF()
q = df.writeStream \
.format("console") \
.trigger(processingTime='30 seconds') \
.start()
q.awaitTermination()
Why can't it convert the DataFrame to an RDD? And what can I do when I want to transform a DataFrame into an RDD in PySpark streaming?
If your Spark version is 2.4.0 or above, you can use the alternative below to work with each row of your DataFrame.
query=df.writeStream.foreach(Customized method to work on each row of dataframe rather than RDD).outputMode("update").start()
ssc.start()
ssc.awaitTermination()
This RDD aspect is simply NOT supported. RDDs are legacy, and Spark Structured Streaming is DataFrame/Dataset based: a common abstraction whether streaming or batch.
To perform specific actions on your DataFrame fields you can use UDFs, or you can even create custom Spark Transformers. However, some DataFrame operations are not supported, such as converting to an RDD.
Structured Streaming runs on the Spark SQL engine. Conversion of a DataFrame or Dataset to an RDD is not supported.

Pyspark Structured streaming processing

I am trying to build a Structured Streaming application with Spark. The main idea is to read from a Kafka source, process the input, and write back to another topic. I have successfully made Spark read from and write to Kafka; my problem is with the processing part. I tried the foreach function to capture every row and process it before writing back to Kafka, but it only ever runs the foreach part and never writes back to Kafka. If I remove the foreach part from the writeStream, it continues writing, but then I lose my processing.
If anyone can give me an example of how to do this, I would be extremely grateful.
Here is my code:
spark = SparkSession \
.builder \
.appName("StructuredStreamingTrial") \
.getOrCreate()
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "KafkaStreamingSource") \
.load()
ds = df \
.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")\
.writeStream \
.outputMode("update") \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("topic", "StreamSink") \
.option("checkpointLocation", "./testdir")\
.foreach(foreach_function)
.start().awaitTermination()
and the foreach_function simply is
def foreach_function(df):
    """Print one streamed item, or print 'fail' if printing it raises.

    Despite the parameter name, this is the callable handed to
    ``writeStream.foreach(...)`` in the snippet above, so it is invoked
    once per item rather than with a whole DataFrame.

    Returns None. Ordinary errors are reported as 'fail'; interpreter-exit
    signals (SystemExit, KeyboardInterrupt) are allowed to propagate.
    """
    try:
        print(df)
    except Exception:
        # Narrowed from a bare ``except:`` so that shutdown requests are not
        # silently swallowed while still keeping the best-effort behaviour.
        print('fail')
To process data before writing it to a Kafka sink with the PySpark Structured Streaming API, you can use a UDF for any kind of complex transformation.
Example code is below. It reads JSON-formatted messages from a Kafka topic, parses each message to convert it from JSON into CSV format, and writes the result to another topic. You can put any processing or transformation in place of the 'json_formatted' function.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
from pyspark.sql.column import Column, _to_java_column
from pyspark.sql.functions import col, struct
from pyspark.sql.functions import udf
import json
import csv
import time
import os
# Spark Streaming context :
spark = SparkSession.builder.appName('pda_inst_monitor_status_update').getOrCreate()
sc = spark.sparkContext
ssc = StreamingContext(sc, 20)
# Creating readstream DataFrame :
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "KafkaStreamingSource") \
.load()
df1 = df.selectExpr( "CAST(value AS STRING)")
df1.registerTempTable("test")
def json_formatted(s):
    """Turn one JSON record into a comma-separated line of six fields.

    Parses ``s`` as JSON and reads ID, INST_NAME, DB_UNIQUE_NAME, DBNAME,
    MON_START_TIME and MON_END_TIME from its ``"after"`` object, in that
    order. The values are converted with ``str`` and joined with ", ".
    All single and double quote characters are removed from the result.

    Raises:
        json.JSONDecodeError: if ``s`` is not valid JSON.
        KeyError: if ``"after"`` or any of the six fields is missing.
    """
    after = json.loads(s)["after"]
    field_names = (
        "ID",
        "INST_NAME",
        "DB_UNIQUE_NAME",
        "DBNAME",
        "MON_START_TIME",
        "MON_END_TIME",
    )
    # Join the values directly instead of stringifying a list and stripping
    # "[]": that approach also stripped brackets belonging to the first/last
    # value and leaked repr() escaping (e.g. doubled backslashes).
    line = ", ".join(str(after[name]) for name in field_names)
    return line.replace("'", "").replace('"', '')
# Register the formatter for use from SQL and wrap it as a column-level UDF.
spark.udf.register("JsonformatterWithPython", json_formatted)
squared_udf = udf(json_formatted)
df1 = spark.table("test")
# The Kafka sink requires the payload column to be named "value"; without the
# alias the column would be named after the UDF expression.
df2 = df1.select(squared_udf("value").alias("value"))

# Write the formatted stream to the output Kafka topic.
# (The original chained `.writeStream` twice; the second access is made on a
# DataStreamWriter, which has no such attribute.)
query = df2.coalesce(1) \
    .writeStream \
    .outputMode("update") \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("topic", "StreamSink") \
    .option("checkpointLocation", "./testdir") \
    .start()

# Wait on the structured-streaming query itself. The DStream context `ssc` is
# never started in this script, so waiting on it would not track this query.
query.awaitTermination()

Resources