pyspark getting distinct values based on groupby column for streaming data - apache-spark

I am trying to get distinct values for a column, based on a groupBy on another column, using a PySpark stream, but I am getting an incorrect count.
Function created:
from pyspark.sql.functions import weekofyear, window, approx_count_distinct

def silverToGold(silverPath, goldPath, queryName):
    (spark.readStream
        .format("delta")
        .load(silverPath)
        .withColumn("week", weekofyear("eventDate"))
        # .groupBy(window(col("week").cast("timestamp"), "5 minute")).approx_count_distinct("device_id")
        # .withColumn("WAU", col("window.start"))
        # .drop("window")
        .groupBy("week").agg(approx_count_distinct("device_id").alias("WAU"))
        .writeStream
        .format("delta")
        .option("checkpointLocation", goldPath + "/_checkpoint")
        # .option("streamName", queryName)
        .queryName(queryName)
        .outputMode("complete")
        .start(goldPath)
        # return queryName
    )
Expected Result:
week WAU
1 7
2 4
3 9
4 9
Actual Result:
week WAU
1 7259
2 7427
3 7739
4 7076
Sample Input Data:
Input data in text format:
device_id,eventName,client_event_time,eventDate,deviceType
00007d948fbe4d239b45fe59bfbb7e64,scoreAdjustment,2018-06-01T16:55:40.000+0000,2018-06-01,android
00007d948fbe4d239b45fe59bfbb7e64,scoreAdjustment,2018-06-01T16:55:34.000+0000,2018-06-01,android
0000a99151154e4eb14c675e8b42db34,scoreAdjustment,2019-08-18T13:39:36.000+0000,2019-08-18,ios
0000b1e931d947b197385ac1cbb25779,scoreAdjustment,2018-07-16T09:13:45.000+0000,2018-07-16,android
0003939e705949e4a184e0a853b6e0af,scoreAdjustment,2018-07-17T17:59:05.000+0000,2018-07-17,android
0003e14ca9ba4198b51cec7d2761d391,scoreAdjustment,2018-06-10T09:09:12.000+0000,2018-06-10,ios
00056f7c73c9497180f2e0900a0626e3,scoreAdjustment,2019-07-05T18:31:10.000+0000,2019-07-05,ios
0006ace2d1db46ba94b802d80a43c20f,scoreAdjustment,2018-07-05T14:31:43.000+0000,2018-07-05,ios
000718c45e164fb2b017f146a6b66b7e,scoreAdjustment,2019-03-26T08:25:08.000+0000,2019-03-26,android
000807f2ea524bd0b7e27df8d44ab930,purchaseEvent,2019-03-26T22:28:17.000+0000,2019-03-26,android
Any suggestions on this?

This version groups on weekofyear directly and passes a tighter relative standard deviation (rsd=0.01) to approx_count_distinct:
def silverToGold(silverPath, goldPath, queryName):
    return (spark.readStream
        .format("delta")
        .load(silverPath)
        .groupBy(weekofyear('eventDate').alias('week'))
        .agg(approx_count_distinct("device_id", rsd=0.01).alias("WAU"))
        .writeStream
        .format("delta")
        .option("checkpointLocation", goldPath + "/_checkpoint")
        .outputMode("complete")
        .start(goldPath)
    )
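A minimal sketch of how the function above might be invoked (the paths and query name are placeholder values, not from the original question):
# Hypothetical silver/gold paths and query name, for illustration only
goldQuery = silverToGold("/mnt/lake/silver/events", "/mnt/lake/gold/wau", "wau_gold")
# start() returns a StreamingQuery handle, so the caller can monitor or block on it
goldQuery.awaitTermination()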

Related

How to reliably obtain partition columns of delta table

I need to obtain the partitioning columns of a Delta table, but DESCRIBE delta.`my_table` returns different results on Databricks and locally in PyCharm.
Minimal example:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
delta_table_path = "c:/temp_delta_table"
partition_column = ["rs_nr"]
schema = StructType([
    StructField("rs_nr", StringType(), False),
    StructField("event_category", StringType(), True),
    StructField("event_counter", IntegerType(), True)])
data = [{'rs_nr': '001', 'event_category': 'event_01', 'event_counter': 1},
        {'rs_nr': '002', 'event_category': 'event_02', 'event_counter': 2},
        {'rs_nr': '003', 'event_category': 'event_03', 'event_counter': 3},
        {'rs_nr': '004', 'event_category': 'event_04', 'event_counter': 4}]
sdf = spark.createDataFrame(data=data, schema=schema)
sdf.write.format("delta").mode("overwrite").partitionBy(partition_column).save(delta_table_path)
df_descr = spark.sql(f"DESCRIBE delta.`{delta_table_path}`")
df_descr.toPandas()
On Databricks, this shows the partition column(s):
col_name data_type comment
0 rs_nr string None
1 event_category string None
2 event_counter int None
3 # Partition Information
4 # col_name data_type comment
5 rs_nr string None
But when running this locally in PyCharm, I get the following different output:
col_name data_type comment
0 rs_nr string
1 event_category string
2 event_counter int
3
4 # Partitioning
5 Part 0 rs_nr
Parsing both types of return value seems ugly to me, so is there a reason that this is returned like this?
Setup:
In Pycharm:
pyspark = 3.2.3
delta-spark = 2.0.0
In DataBricks:
DBR 11.3 LTS
Spark = 3.3.0 (I just noted that this differs, I will test if 3.3.0 works locally in the meantime)
Scala = 2.12
In PyCharm, I create the connection using:
from pyspark.sql import SparkSession

def get_spark():
    spark = SparkSession.builder.appName('schema_checker')\
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
        .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")\
        .config("spark.sql.catalogImplementation", "in-memory")\
        .getOrCreate()
    return spark
If you're using Python, then instead of executing a SQL command that is harder to parse, it's better to use the Python API. The DeltaTable instance has a detail function that returns a DataFrame with details about the table (doc), and this DataFrame has a partitionColumns column that is an array of strings with the partition column names. So you can just do:
from delta.tables import *
detailDF = DeltaTable.forPath(spark, delta_table_path).detail()
partitions = detailDF.select("partitionColumns").collect()[0][0]
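For the minimal example above, where the table was written with partitionBy(["rs_nr"]), this returns a plain Python list containing just that column:
# partitions is a list of partition column names
print(partitions)  # ['rs_nr']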

How to find out the total amount for each month using Spark in Python

I'm looking for a way to aggregate my data by month. First I want to keep only the month of my visitdate. My DataFrame looks like this:
Row(visitdate = 1/1/2013,
    patientid = P1_Pt1959,
    amount = 200,
    note = jnut,
)
My goal is then to group by visitdate and calculate the sum of amount. I tried this:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
file_path = "G:/Visit Data.csv"
patients = spark.read.csv(file_path, header=True)
patients.createOrReplaceTempView("visitdate")
sqlDF = spark.sql("SELECT visitdate, SUM(amount) AS totalamount FROM visitdate GROUP BY visitdate")
sqlDF.show()
This is the result:
+----------+-----------+
| visitdate|totalamount|
+----------+-----------+
|  9/1/2013|    10800.0|
|25/04/2013|    12440.0|
|27/03/2014|    16930.0|
|26/03/2015|    18560.0|
|14/05/2013|    13770.0|
|30/06/2013|    13880.0|
+----------+-----------+
My goal is to get something like this:
+---------+-----------+
|visitdate|totalamount|
+---------+-----------+
| 1/1/2013|    10800.0|
| 1/2/2013|    12440.0|
| 1/3/2013|    16930.0|
| 1/4/2014|    18560.0|
| 1/5/2015|    13770.0|
| 1/6/2015|    13880.0|
+---------+-----------+
You need to truncate your dates down to months so they group properly, then do a groupBy/sum. There is a Spark function to do this for you, called date_trunc. For example:
from datetime import date
from pyspark.sql.functions import date_trunc, sum

data = [
    (date(2000, 1, 2), 1000),
    (date(2000, 1, 2), 2000),
    (date(2000, 2, 3), 3000),
    (date(2000, 2, 4), 4000),
]
df = spark.createDataFrame(sc.parallelize(data), ["date", "amount"])
df.groupBy(date_trunc("month", df.date)).agg(sum("amount")).show()
+-----------------------+-----------+
|date_trunc(month, date)|sum(amount)|
+-----------------------+-----------+
| 2000-01-01 00:00:00| 3000|
| 2000-02-01 00:00:00| 7000|
+-----------------------+-----------+
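Applied to the question's data, where visitdate is a string such as 25/04/2013, a possible sketch (column names are taken from the question; the d/M/yyyy pattern is an assumption about the file's date format):
from pyspark.sql.functions import to_date, date_trunc, date_format, sum as sum_

# Parse the d/M/yyyy strings, truncate each date to the first of its month, then sum per month
monthly = (patients
    .withColumn("visitmonth", date_trunc("month", to_date("visitdate", "d/M/yyyy")))
    .groupBy("visitmonth")
    .agg(sum_("amount").alias("totalamount"))
    # Optional: render the month back in the day/month/year style used in the question
    .withColumn("visitmonth", date_format("visitmonth", "d/M/yyyy")))
monthly.show()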

Use WHERE or FILTER when creating a TempView

Is it possible to use where or filter when creating a SparkSQL TempView?
I have a Cassandra table words with
word | count
------------
apples | 20
banana | 10
I tried
%spark
val df = sqlContext
  .read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("keyspace" -> "temp", "table" -> "words"))
  .where($"count" > 10)
  .load()
  .createOrReplaceTempView("high_counted")
or
%spark
val df = sqlContext
  .read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("keyspace" -> "temp", "table" -> "words"))
  .where("count > 10")
  .load()
  .createOrReplaceTempView("high_counted")
You cannot do a WHERE or FILTER without .load()ing the table as #undefined_variable suggested.
Try:
%spark
val df = sqlContext
  .read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("keyspace" -> "temp", "table" -> "words"))
  .load()
  .where($"count" > 10)
  .createOrReplaceTempView("high_counted")
Alternatively, you can do a free form query as documented here.
Spark evaluates statements in a lazy fashion, and the above statement is a transformation (in case you are thinking we need to filter before we load).

Structured streaming debugging input

Is there a way for me to print out the incoming data? For example, I have a readStream on a folder looking for JSON files; however, there seems to be an issue, as I am seeing nulls in the aggregation output.
val schema = StructType(
  StructField("id", LongType, false) ::
  StructField("sid", IntegerType, true) ::
  StructField("data", ArrayType(IntegerType, false), true) :: Nil)
val lines = spark.
  readStream.
  schema(schema).
  json("in/*.json")
val top1 = lines.groupBy("id").count()
val query = top1.writeStream
  .outputMode("complete")
  .format("console")
  .option("truncate", "false")
  .start()
To print the data you can add a queryName to the write stream; using that queryName you can then query and print it.
In your example:
val query = top1.writeStream
  .outputMode("complete")
  .queryName("xyz")
  .format("console")
  .option("truncate", "false")
  .start()
Run this and you can display the data by using a SQL query:
%sql select * from xyz
or you can create a DataFrame:
val df = spark.sql("select * from xyz")

PYSPARK: UnicodeEncodeError: 'latin-1' codec can't encode characters in position 515-517: ordinal not in range(256)

I am attempting to show my resulting DataFrame using PySpark and am getting the error mentioned above.
from pyspark.sql.functions import col, count, countDistinct, when, trim, isnull, concat_ws, concat, lit, substring, round, avg, max, min, length, udf
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Get stats for all columns in a table
def get_table_stats(table, columns):
    udf_to_string = udf(lambda x: str(x), StringType())

    # Populate dataframes
    table_df = sqlContext.sql('SELECT ' + ','.join(columns) + ' FROM ' + table)

    # New df columns
    table_headers = ['_column', \
                     '_max', \
                     '_min', \
                     '_avg', \
                     '_max_length', \
                     '_min_length', \
                     '_avg_length']

    # Cycle through each column, obtain main stats and put in to dataframe
    for index, column in enumerate(table_df.columns):
        # Selecting relevant column value and also the length of those values.
        # The values must first be converted to strings to obtain the lengths, hence the use of the to_string UDF
        length_alias = column + '_lengths'
        col_df = table_df.select(column, length(udf_to_string(col(column))).alias(length_alias))

        # Relevant aggregates are determined, given aliases and stored as a single row dataframe
        aggs_df = col_df.agg(max(column).alias(table_headers[1]), \
                             min(column).alias(table_headers[2]), \
                             avg(column).alias(table_headers[3]), \
                             max(length_alias).alias(table_headers[4]), \
                             min(length_alias).alias(table_headers[5]), \
                             avg(length_alias).alias(table_headers[6]))

        # Add the column name as a column in our results row dataframe
        temp_raw_df = aggs_df.withColumn(table_headers[0], lit(column))

        # As we want to concatenate each row of column results to return a full dataframe of results, we must
        # ensure all values are of the same type, thus convert every value to a string
        temp_df = temp_raw_df.select([temp_raw_df['_column']] + [temp_raw_df[col_to_cast].cast(StringType()) for col_to_cast in temp_raw_df.columns[:-1]])

        # Update master_df after each column results are aggregated
        if index == 0:
            master_df = temp_df
        else:
            master_df = master_df.union(temp_df)

    return master_df
Defining this function and running the following code gives me the error shown below.
>>> mydf = get_table_stats(table, ['index', 'name', 'age'])
>>> mydf.show()
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 515-517: ordinal not in range(256)
Does anyone know if this is a problem with Spark itself, or is it a Python problem?
I have checked the encoding used in my IPython console and it is 'UTF-8'.
