How to get the latest version of a Hudi table - apache-spark

I have a Spark streaming job that listens to a Kinesis stream and writes the records to a Hudi table. What I want to do is, say for example, add these two records to the Hudi table:
| user_id | name  | timestamp |
| ------- | ----- | --------- |
| 1       | name1 | 1.1.1     |
| 2       | name2 | 1.1.1     |
This is reflected in the Hudi table initially, but what if I then edit the second record so its name becomes name2_new?
What I expect is that I will have three records, each with its own timestamp, like this:
| user_id | name      | timestamp |
| ------- | --------- | --------- |
| 1       | name1     | 1.1.1     |
| 2       | name2     | 1.1.1     |
| 2       | name2_new | 2.2.2     |
When I query this in Athena I get the three records as expected, but what if I want extra analysis where I only need the latest version of each record? That result should look like this (because I updated record 2):
| user_id | name      | timestamp |
| ------- | --------- | --------- |
| 1       | name1     | 1.1.1     |
| 2       | name2_new | 2.2.2     |
Is there any way I can get the latest version only?
Here's the Glue job I used to create the Hudi table:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.sql.session import SparkSession
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when
import datetime
from awsglue import DynamicFrame
import boto3
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv,
                          ["JOB_NAME", "database_name", "kinesis_table_name", "starting_position_of_kinesis_iterator",
                           "hudi_table_name", "window_size", "s3_path_hudi", "s3_path_spark"])
spark = SparkSession.builder.config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer').config(
    'spark.sql.hive.convertMetastoreParquet', 'false').getOrCreate()
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
database_name = args["database_name"]
kinesis_table_name = args["kinesis_table_name"]
hudi_table_name = args["hudi_table_name"]
s3_path_hudi = args["s3_path_hudi"]
s3_path_spark = args["s3_path_spark"]
print("***********")
print(f"""
database_name {database_name}
kinesis_table_name = {kinesis_table_name}
hudi_table_name ={hudi_table_name}
s3_path_hudi = {s3_path_hudi}
s3_path_spark = {s3_path_spark}
""")
# can be set to "latest", "trim_horizon" or "earliest"
starting_position_of_kinesis_iterator = args["starting_position_of_kinesis_iterator"]
# The amount of time to spend processing each batch
window_size = args["window_size"]
data_frame_DataSource0 = glueContext.create_data_frame.from_catalog(
    database=database_name,
    table_name=kinesis_table_name,
    transformation_ctx="DataSource0",
    additional_options={"inferSchema": "true", "startingPosition": starting_position_of_kinesis_iterator}
)
# config
commonConfig = {
    'path': s3_path_hudi
}
hudiWriteConfig = {
    'className': 'org.apache.hudi',
    'hoodie.table.name': hudi_table_name,
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.table.type': 'MERGE_ON_READ',
    'hoodie.datasource.write.precombine.field': 'timestamp',
    'hoodie.datasource.write.recordkey.field': 'user_id,timestamp',
    # 'hoodie.datasource.write.partitionpath.field': 'year:SIMPLE,month:SIMPLE,day:SIMPLE',
    # 'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.CustomKeyGenerator',
    # 'hoodie.deltastreamer.keygen.timebased.timestamp.type': 'DATE_STRING',
    # 'hoodie.deltastreamer.keygen.timebased.input.dateformat': 'yyyy-mm-dd',
    # 'hoodie.deltastreamer.keygen.timebased.output.dateformat': 'yyyy/MM/dd'
}
hudiGlueConfig = {
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.sync_as_datasource': 'false',
    'hoodie.datasource.hive_sync.database': database_name,
    'hoodie.datasource.hive_sync.table': hudi_table_name,
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    # 'hoodie.datasource.write.hive_style_partitioning': 'false',
    # 'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    # 'hoodie.datasource.hive_sync.partition_fields': 'year,month,day'
}
combinedConf = {
    **commonConfig,
    **hudiWriteConfig,
    **hudiGlueConfig
}
# ensure the incoming record has the correct current schema; new columns are fine, but if a column exists
# in the current schema and not in the incoming record, it must be added manually before inserting
def evolveSchema(kinesis_df, table, forcecast=False):
    try:
        # get existing table's schema
        print("in evolve schema")
        kinesis_df.show(truncate=False)
        glue_catalog_df = spark.sql("SELECT * FROM " + table + " LIMIT 0")
        # sanitize for hudi specific system columns
        # columns_to_drop = ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
        #                    '_hoodie_partition_path', '_hoodie_file_name']
        columns_to_drop = ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
                           '_hoodie_file_name']
        glue_catalog_df_sanitized = glue_catalog_df.drop(*columns_to_drop)
        if kinesis_df.schema != glue_catalog_df_sanitized.schema:
            merged_df = kinesis_df.unionByName(glue_catalog_df_sanitized, allowMissingColumns=True)
            return merged_df
    except Exception as e:
        print(e)
    return kinesis_df
def processBatch(data_frame, batchId):
    print("data frame is")
    data_frame.show(truncate=False)
    schema_main = StructType(
        [
            StructField('data', StringType(), True),
            StructField('metadata', StringType(), True)
        ]
    )
    schema_data = StructType(
        [
            StructField("user_id", IntegerType(), True),
            StructField("firstname", StringType(), True),
            StructField("lastname", StringType(), True),
            StructField("address", StringType(), True),
            StructField("email", StringType(), True)
        ]
    )
    schema_metadata = StructType(
        [
            StructField("timestamp", StringType(), True),
            StructField("record-type", StringType(), True),
            StructField("operation", StringType(), True),
            StructField("partition-key-type", StringType(), True),
            StructField("schema-name", StringType(), True),
            StructField("table-name", StringType(), True),
            StructField("transaction-id", StringType(), True)
        ]
    )
    data_frame = data_frame.withColumn("$json$data_infer_schema$_temporary$",
                                       from_json("$json$data_infer_schema$_temporary$", schema_main)) \
        .select(col("$json$data_infer_schema$_temporary$.*")) \
        .withColumn("data", from_json("data", schema_data)) \
        .withColumn("metadata", from_json("metadata", schema_metadata)) \
        .select(col("data.*"), col("metadata.timestamp"))
    print("data frame is")
    data_frame.show(truncate=False)
    # column_headers = list(data_frame.columns)
    # print("The Column Header :", column_headers)
    # data_frame.printSchema()
    if data_frame.count() > 0:
        kinesis_dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_kinesis_data_frame")
        # print("dynamic frame is")
        # kinesis_dynamic_frame.show(truncate=False)
        # print('d')
        kinesis_data_frame = kinesis_dynamic_frame.toDF()
        print("kinesis is")
        kinesis_data_frame.show(truncate=False)
        kinesis_data_frame = evolveSchema(kinesis_data_frame, database_name + '.' + hudi_table_name, False)
        glueContext.write_dynamic_frame.from_options(
            frame=DynamicFrame.fromDF(kinesis_data_frame, glueContext, "kinesis_data_frame"),
            connection_type="custom.spark",
            connection_options=combinedConf
        )
glueContext.forEachBatch(
    frame=data_frame_DataSource0,
    batch_function=processBatch,
    options={
        "windowSize": window_size,
        "checkpointLocation": s3_path_spark
    }
)
job.commit()

Usually in this kind of application we upsert the table and keep the record with the largest timestamp when the keys of incoming records match existing ones. But since you need the history of all the updates, you have two options to achieve what you are looking for:
Create a new table containing the last version of each record, either by duplicating your job sinks (kinesis -> full_history_table, kinesis -> last_state_table) or by creating a stream (mini batches) on the first table using incremental queries (kinesis -> full_history_table -> last_state_table; see the sketch after the SQL below). In this case you will have the result for your two queries without the need to aggregate the data.
Aggregate the table on each query: if your query is not frequent, you can deduplicate the data on the fly using a window function. You can create a view and apply your query on it:
SELECT user_id, name, timestamp
FROM (
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY timestamp DESC) AS rn
    FROM TABLE_NAME t
) t
WHERE rn = 1;
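Not part of the original answer, but if you go with the incremental-query variant of the first option, a rough PySpark sketch could look like the one below. The last_processed_instant bookmark, the last_state_table name and last_state_path are hypothetical placeholders; only the Hudi options themselves are standard:
# read only the commits added to the full-history table since the last processed instant
incremental_df = (
    spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "incremental")
    .option("hoodie.datasource.read.begin.instanttime", last_processed_instant)  # e.g. "20230101000000" (placeholder)
    .load(s3_path_hudi)
)
# drop Hudi metadata columns before writing to the second table
clean_df = incremental_df.drop("_hoodie_commit_time", "_hoodie_commit_seqno", "_hoodie_record_key",
                               "_hoodie_partition_path", "_hoodie_file_name")
# upsert into a table keyed on user_id only, so it always holds the latest version of each record
(
    clean_df.write.format("hudi")
    .option("hoodie.table.name", "last_state_table")                  # placeholder name
    .option("hoodie.datasource.write.operation", "upsert")
    .option("hoodie.datasource.write.recordkey.field", "user_id")
    .option("hoodie.datasource.write.precombine.field", "timestamp")
    .mode("append")
    .save(last_state_path)                                            # placeholder path
)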

Related

Access accumulator value after using it in a user defined function within df.withColumn in Palantir Foundry

I am trying to use a customized accumulator within Palantir Foundry to aggregate data within
a user defined function which is applied to each row of a dataframe within a statement df.withColumn(...).
From the resulting dataframe, I see that the incrementation of the accumulator value happens as expected. However, the value of the accumulator variable itself in the script does not change during the execution.
I see that the Python ID of the accumulator variable in the script differs from the Python ID of the accumulator within the user defined function. But that might be expected...
How do I access the accumulator value, whose incrementation can be watched in the resulting dataframe column, from within the calling script after the execution? This is the information I am looking for.
from transforms.api import transform_df, Input, Output
import numpy as np
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.functions import udf, struct
global accum

@transform_df(
    Output("ri.foundry.main.dataset.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"),
)
def compute(ctx):
    from pyspark.sql.types import StructType, StringType, IntegerType, StructField

    data2 = [("James", "", "Smith", "36636", "M", 3000),
             ("Michael", "Rose", "", "40288", "M", 4000),
             ("Robert", "", "Williams", "42114", "M", 4000),
             ("Maria", "Anne", "Jones", "39192", "F", 4000),
             ("Jen", "Mary", "Brown", "", "F", -1)
             ]

    schema = StructType([
        StructField("firstname", StringType(), True),
        StructField("middlename", StringType(), True),
        StructField("lastname", StringType(), True),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True)
    ])

    df = ctx.spark_session.createDataFrame(data=data2, schema=schema)

    ####################################

    class AccumulatorNumpyArray(AccumulatorParam):
        def zero(self, zero: np.ndarray):
            return zero

        def addInPlace(self, v1, v2):
            return v1 + v2

    # from pyspark.context import SparkContext
    # sc = SparkContext.getOrCreate()
    sc = ctx.spark_session.sparkContext

    shape = 3

    global accum
    accum = sc.accumulator(
        np.zeros(shape, dtype=np.int64),
        AccumulatorNumpyArray(),
    )

    def func(row):
        global accum
        accum += np.ones(shape)
        return str(accum) + '_' + str(id(accum))

    user_defined_function = udf(func, StringType())

    new = df.withColumn("processed", user_defined_function(struct([df[col] for col in df.columns])))
    new.show(2)

    print(accum)

    return df
results in
+---------+----------+--------+-----+------+------+--------------------+
|firstname|middlename|lastname| id|gender|salary| processed|
+---------+----------+--------+-----+------+------+--------------------+
| James| | Smith|36636| M| 3000|[1. 1. 1.]_140388...|
| Michael| Rose| |40288| M| 4000|[2. 2. 2.]_140388...|
+---------+----------+--------+-----+------+------+--------------------+
only showing top 2 rows
and
> accum
Accumulator<id=0, value=[0 0 0]>
> id(accum)
140574405092256
If the Foundry boilerplate is removed, resulting in
import numpy as np
from pyspark.accumulators import AccumulatorParam
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import StructType, StringType, IntegerType, StructField
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

spark = (
    SparkSession.builder.appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# ctx = spark.sparkContext.getOrCreate()

data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

schema = StructType(
    [
        StructField("firstname", StringType(), True),
        StructField("middlename", StringType(), True),
        StructField("lastname", StringType(), True),
        StructField("id", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", IntegerType(), True),
    ]
)

# df = ctx.spark_session.createDataFrame(data=data2, schema=schema)
df = spark.createDataFrame(data=data2, schema=schema)

####################################

class AccumulatorNumpyArray(AccumulatorParam):
    def zero(self, zero: np.ndarray):
        return zero

    def addInPlace(self, v1, v2):
        return v1 + v2

sc = SparkContext.getOrCreate()

shape = 3

global accum
accum = sc.accumulator(
    np.zeros(shape, dtype=np.int64),
    AccumulatorNumpyArray(),
)

def func(row):
    global accum
    accum += np.ones(shape)
    return str(accum) + "_" + str(id(accum))

user_defined_function = udf(func, StringType())

new = df.withColumn(
    "processed", user_defined_function(struct([df[col] for col in df.columns]))
)
new.show(2, False)

print(id(accum))
print(accum)
the output obtained within a regular Python environment with pyspark version 3.3.1 on Ubuntu meets the expectations and is
+---------+----------+--------+-----+------+------+--------------------------+
|firstname|middlename|lastname|id |gender|salary|processed |
+---------+----------+--------+-----+------+------+--------------------------+
|James | |Smith |36636|M |3000 |[1. 1. 1.]_139642682452576|
|Michael |Rose | |40288|M |4000 |[1. 1. 1.]_139642682450224|
+---------+----------+--------+-----+------+------+--------------------------+
only showing top 2 rows
140166944013424
[3. 3. 3.]
The code that runs outside of the transform is run in a different environment than the code within your transform. When you commit, the checks run the code outside the transform to generate the jobspec, which is technically your executable transform. You can find these within the "details" of your dataset after the checks pass.
The logic within your transform is then detached and runs in isolation each time you hit build. The global accum you define outside the transform is never run and doesn't exist when the code inside compute is running.
global accum  <-- runs in checks

@transform_df(
    Output("ri.foundry.main.dataset.c0d4fc0c-bb1d-4c7b-86ce-a13ec6666490"),
)
def compute(ctx):
    bla bla some logic  <-- runs during build
The prints you are doing in your second code example happen after the df is processed, because you are asking Spark to compute with new.show(2, False). The print you are doing in the first example happens before the df is processed, since the computation will only happen after your return df.
If you want to print after your df is computed, you can use @transform(... instead of @transform_df(... and do a print after writing the dataframe contents. It should be something like this:
@transform(
    output=Output("ri.foundry.main.dataset.c0d4fc0c-bb1d-4c7b-86ce-a13ec6666490"),
)
def compute(ctx, output):
    df = ... some logic ...
    output.write_dataframe(df)  # please check the function name; I think it was write_dataframe, but may be wrong
    print(accum)

Pyspark: add one row dynamically into the final dataframe

I've a final dataframe with this format:
Product_ID: string
Product_COD: string
Product_NAM: string
Product_VER: integer
ProductLine_NAM: string
Language_COD: string
ProductType_NAM: string
Load_DAT: integer
LoadEnd_DAT: integer
edmChange_DTT: timestamp
I want to add a new row to that dataframe where the ID (Product_ID) is -1, the string columns contain 'Unknown', and the remaining datatypes are set to null. For example:
I created this code:
id_column = "Product_ID"
df_lessOne = spark.createDataFrame(["-1"], "string").toDF(id_column)  # create a new id_column row with -1
appended_df = finalDf.unionByName(df_lessOne, allowMissingColumns=True)  # add the rest columns of dataframe with nulls
appended_df_filter = appended_df.filter("" + id_column + " = '-1'")
columns = [item[0] for item in appended_df_filter.dtypes if item[1].startswith('string')]  # select only string columns
# replace string columns with "Unknown"
for c_na in columns:
    appended_df_filter = (appended_df_filter
                          .filter("" + id_column + " = '-1'")
                          .withColumn(c_na, lit('Unknown'))
                          )
appended_df = appended_df.filter("" + id_column + " <> '-1'")
dfs = [appended_df, appended_df_filter]
# add final -1 row to the final dataframe
finalDf = reduce(DataFrame.unionAll, dfs)
display(finalDf)
but unfortunately it's not working well.
I'm trying to create this dynamically because I want to reuse it on other dataframes afterwards; I would just need to change the id_column.
Can anyone please help me achieve this?
Thank you!
from pyspark.sql.types import *
from datetime import datetime
import pyspark.sql.functions as F
data2 = [
    ("xp3980", "2103", "Product_1", 1, "PdLine_23", "XX1", "PNT_1", 2, 36636, datetime.strptime('2020-08-20 10:00:00', '%Y-%m-%d %H:%M:%S')),
    ("gi9387", "2411", "Product_2", 1, "PdLine_44", "YT89", "PNT_6", 2, 35847, datetime.strptime('2021-07-21 7:00:00', '%Y-%m-%d %H:%M:%S'))
]

schema = StructType([
    StructField("Product_ID", StringType(), True),
    StructField("Product_COD", StringType(), True),
    StructField("Product_NAM", StringType(), True),
    StructField("Product_VER", IntegerType(), True),
    StructField("ProductLine_NAM", StringType(), True),
    StructField("Language_COD", StringType(), True),
    StructField("ProductType_NAM", StringType(), True),
    StructField("Load_DAT", IntegerType(), True),
    StructField("LoadEnd_DAT", IntegerType(), True),
    StructField("edmChange_DTT", TimestampType(), True)
])

my_df = spark.createDataFrame(data=data2, schema=schema)

df_res = spark.createDataFrame([(-1,)]).toDF("Product_ID")

for c in my_df.schema:
    if str(c.name) == 'Product_ID':
        continue
    if str(c.dataType) == 'StringType':
        df_res = df_res.withColumn(c.name, F.lit('Unknown'))
    else:
        df_res = df_res.withColumn(c.name, F.lit(None))

my_df.union(df_res).show()
# +----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+
# |Product_ID|Product_COD|Product_NAM|Product_VER|ProductLine_NAM|Language_COD|ProductType_NAM|Load_DAT|LoadEnd_DAT| edmChange_DTT|
# +----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+
# | xp3980| 2103| Product_1| 1| PdLine_23| XX1| PNT_1| 2| 36636|2020-08-20 10:00:00|
# | gi9387| 2411| Product_2| 1| PdLine_44| YT89| PNT_6| 2| 35847|2021-07-21 07:00:00|
# | -1| Unknown| Unknown| null| Unknown| Unknown| Unknown| null| null| null|
# +----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+
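One caveat that is not in the original answer: union matches columns by position, which works here because the loop adds df_res's columns in the same order as my_df's schema. If that order could ever differ, unionByName matches columns by name instead:
my_df.unionByName(df_res).show()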

PySpark row to struct with specified structure

This is my initial dataframe:
columns = ["CounterpartID","Year","Month","Day","churnprobability", "deadprobability"]
data = [(1234, 2021,5,12, 0.85,0.6),(1224, 2022,6,12, 0.75,0.6),(1345, 2022,5,13, 0.8,0.2),(234, 2021,7,12, 0.9,0.8)]
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
schema = StructType([
    StructField("client_id", IntegerType(), False),
    StructField("year", IntegerType(), False),
    StructField("month", IntegerType(), False),
    StructField("day", IntegerType(), False),
    StructField("churn_probability", DoubleType(), False),
    StructField("dead_probability", DoubleType(), False)
])
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)
Then I do some transformations on the columns (basically, separating each float column into a before-decimal and an after-decimal column) to get the intermediary dataframe.
abc = df.rdd.map(lambda x: (x[0],x[1],x[2],x[3],int(x[4]),int(x[4]%1 * pow(10,9)), int(x[5]),int(x[5]%1 * pow(10,9)) )).toDF(['client_id','year', 'month', 'day', 'churn_probability_unit', 'churn_probability_nano', 'dead_probability_unit', 'dead_probability_nano'] )
display(abc)
Below is the final desired dataframe (this is just an example of one row, but of course I'll need all the rows from the intermediary dataframe).
sjson = {"clientId": {"id": 1234 },"eventDate": {"year": 2022,"month": 8,"day": 5},"churnProbability": {"rate": {"units": "500","nanos": 780000000}},"deadProbability": {"rate": {"units": "500","nanos": 780000000}}}
df = spark.read.json(sc.parallelize([sjson])).select("clientId", "eventDate", "churnProbability", "deadProbability")
display(df)
How do I reach this end state from the intermediary state efficiently for all rows?
The end goal is to use this final dataframe to write to Kafka, where the schema of the topic is a form of the final desired dataframe.
I would probably eliminate the rdd logic (and the extra toDF) by using just one select on your original df:
from pyspark.sql import functions as F
defg = df.select(
    F.struct(F.col('client_id').alias('id')).alias('clientId'),
    F.struct('year', 'month', 'day').alias('eventDate'),
    F.struct(
        F.struct(
            F.floor('churn_probability').alias('unit'),
            (F.col('churn_probability') % 1 * 10**9).cast('long').alias('nanos')
        ).alias('rate')
    ).alias('churnProbability'),
    F.struct(
        F.struct(
            F.floor('dead_probability').alias('unit'),
            (F.col('dead_probability') % 1 * 10**9).cast('long').alias('nanos')
        ).alias('rate')
    ).alias('deadProbability'),
)
defg.show()
# +--------+-------------+----------------+----------------+
# |clientId| eventDate|churnProbability| deadProbability|
# +--------+-------------+----------------+----------------+
# | {1234}|{2021, 5, 12}|{{0, 850000000}}|{{0, 600000000}}|
# | {1224}|{2022, 6, 12}|{{0, 750000000}}|{{0, 600000000}}|
# | {1345}|{2022, 5, 13}|{{0, 800000000}}|{{0, 200000000}}|
# | {234}|{2021, 7, 12}|{{0, 900000000}}|{{0, 800000000}}|
# +--------+-------------+----------------+----------------+
So, I was able to solve this using structs, without using to_json:
import pyspark.sql.functions as f

defg = abc.withColumn(
    "clientId",
    f.struct(
        f.col("client_id").alias("id")
    )
).withColumn(
    "eventDate",
    f.struct(
        f.col("year").alias("year"),
        f.col("month").alias("month"),
        f.col("day").alias("day"),
    )
).withColumn(
    "churnProbability",
    f.struct(
        f.struct(
            f.col("churn_probability_unit").alias("unit"),
            f.col("churn_probability_nano").alias("nanos")
        ).alias("rate")
    )
).withColumn(
    "deadProbability",
    f.struct(
        f.struct(
            f.col("dead_probability_unit").alias("unit"),
            f.col("dead_probability_nano").alias("nanos")
        ).alias("rate")
    )
).select("clientId", "eventDate", "churnProbability", "deadProbability")
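Since the stated end goal is writing this dataframe to Kafka, here is a minimal sketch of serializing the struct columns to JSON for the Kafka sink; the topic name and bootstrap servers are placeholders, not from the original post:
kafka_df = defg.select(
    # serialize the whole row of structs as the Kafka message value
    f.to_json(f.struct("clientId", "eventDate", "churnProbability", "deadProbability")).alias("value")
)
(
    kafka_df.write.format("kafka")
    .option("kafka.bootstrap.servers", "broker1:9092")  # placeholder
    .option("topic", "client-probabilities")            # placeholder
    .save()
)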

Dynamic dictionary in pyspark

I am trying to build a dictionary dynamically using PySpark, by reading the table structure from the Oracle database. Here's a simplified version of my code.
Predefined dictionary (convert_dict.py):
conversions = {
    "COL1": lambda c: f.col(c).cast("string"),
    "COL2": lambda c: f.from_unixtime(f.unix_timestamp(c, dateFormat)).cast("date"),
    "COL3": lambda c: f.from_unixtime(f.unix_timestamp(c, dateFormat)).cast("date"),
    "COL4": lambda c: f.col(c).cast("float")
}
Main program
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType
from convert_dict import conversions
spark = SparkSession.builder.appName("file_testing").getOrCreate()
table_name = "TEST_TABLE"
input_file_path = "file:\\\c:\Desktop\foo.txt"
sql_query = "(select listagg(column_name,',') within group(order by column_id) col from user_tab_columns where " \
            "table_name = '" + table_name + "' and column_name not in ('COL10', 'COL11','COL12') order by column_id) table_columns"

struct_schema = StructType([
    StructField("COL1", StringType(), True),
    StructField("COL2", StringType(), True),
    StructField("COL3", StringType(), True),
    StructField("COL4", StringType(), True),
])
data_df = spark.read.schema(struct_schema).option("sep", ",").option("header", "true").csv(input_file_path)
validateData = data_df.withColumn(
    "dataTypeValidations",
    f.concat_ws(",",
                *[
                    f.when(
                        v(k).isNull() & f.col(k).isNotNull(),
                        f.lit(k + " not valid")
                    ).otherwise(f.lit("None"))
                    for k, v in conversions.items()
                ]
                )
)

data_temp = validateData
for k, v in conversions.items():
    data_temp = data_temp.withColumn(k, v(k))

validateData.show()
spark.stop()
If I change the above code to dynamically generate the dictionary from the database:
DATEFORMAT = "yyyyMMdd"
dict_sql = """
(select column_name,
        case when data_type = 'VARCHAR2' then 'string'
             when data_type in ('DATE','TIMESTAMP(6)') then 'date'
             when data_type = 'NUMBER' and NVL(DATA_SCALE,0) <> 0 then 'float'
             when data_type = 'NUMBER' and NVL(DATA_SCALE,0) = 0 then 'int' end d_type
 from user_tab_columns
 where table_name = 'TEST_TABLE' and column_name not in ('COL10', 'COL11','COL12')) dict
"""

column_df = spark.read.format("jdbc").option("url", url).option("dbtable", dict_sql)\
    .option("user", user).option("password", password).option("driver", driver).load()

conversions = {}
for row in column_df.rdd.collect():
    column_name = row.COLUMN_NAME
    column_type = row.D_TYPE
    if column_type == "date":
        conversions.update({column_name: lambda c: f.col(c)})
    elif column_type == "float":
        conversions.update({column_name: lambda c: f.col(c).cast("float")})
    elif column_type == "date":
        conversions.update({column_name: lambda c: f.from_unixtime(f.unix_timestamp(c, DATEFORMAT)).cast("date")})
    elif column_type == "int":
        conversions.update({column_name: lambda c: f.col(c).cast("int")})
    else:
        conversions.update({column_name: lambda c: f.col(c)})
The conversion of data types doesn't work when the above dynamically generated dictionary is used. For example, if "COL2" contains "20210731", the resulting data from the above code stays the same, i.e. it doesn't get converted to the correct date format, whereas the predefined dictionary works correctly.
Am I missing something here, or is there a better way to implement dynamically generated dictionaries in PySpark?
Had a rookie mistake in my code: in the if-then-else block, I had two separate branches for column_type == "date", so the first one (which returns f.col(c) with no conversion) always won.
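For reference, a minimal sketch of the corrected loop, keeping the lambdas from the question and simply collapsing the duplicate "date" branch into one:
conversions = {}
for row in column_df.rdd.collect():
    column_name = row.COLUMN_NAME
    column_type = row.D_TYPE
    if column_type == "date":
        # single "date" branch: parse with the expected format, then cast to date
        conversions.update({column_name: lambda c: f.from_unixtime(f.unix_timestamp(c, DATEFORMAT)).cast("date")})
    elif column_type == "float":
        conversions.update({column_name: lambda c: f.col(c).cast("float")})
    elif column_type == "int":
        conversions.update({column_name: lambda c: f.col(c).cast("int")})
    else:
        conversions.update({column_name: lambda c: f.col(c)})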

PySpark: data doesn't always conform to schema - logic to alter data

I'm new to PySpark and am working on a script that reads from .csv files.
I've explicitly defined the schema below, and the script works perfectly... most of the time.
The issue is that, on occasion, a value enters the files which does not conform to the schema - e.g. '-' might appear in an integer field - and hence we get a type error; the error is thrown when df1.show() is reached in the script.
I'm trying to think of a way to effectively say: if the value does not match the defined datatype, then replace it with ''.
Does anyone know if this may be possible? Any advice would be great!
from pyspark.sql import SparkSession
import pyspark.sql.functions as sqlfunc
from pyspark.sql.types import *
import argparse, sys
from pyspark.sql import *
from pyspark.sql.functions import *
from datetime import datetime
# create a context that supports hive
def create_session(appname):
    spark_session = SparkSession\
        .builder\
        .appName(appname)\
        .master('yarn')\
        .config("hive.metastore.uris", "thrift://serverip:9083")\
        .enableHiveSupport()\
        .getOrCreate()
    return spark_session

### START MAIN ###
if __name__ == '__main__':
    spark_session = create_session('testing_files')

    dt_now = datetime.now()
    today_unixtime = long(dt_now.strftime('%s'))
    today_date = datetime.fromtimestamp(today_unixtime).strftime('%Y%m%d')

    twoday_unixtime = long(dt_now.strftime('%s')) - 24*60*60*2
    twoday = datetime.fromtimestamp(twoday_unixtime).strftime('%Y%m%d')

    hourago = long(dt_now.strftime('%s')) - 60*60*4
    hrdate = datetime.fromtimestamp(hourago).strftime('%H')

    schema = [
        StructField('field1', StringType(), True),
        StructField('field2', StringType(), True),
        StructField('field3', IntegerType(), True)
    ]
    final_structure = StructType(schema)

    df1 = spark_session.read\
        .option("header", "false")\
        .option("delimiter", "\t")\
        .csv('hdfs://hdfspath/dt=%s/*/*/*' % today_date, final_structure)

    usercatschema = [
        StructField('field1', StringType(), True),
        StructField('field2', StringType(), True),
        StructField('field3', StringType(), True)
    ]
    usercat_structure = StructType(usercatschema)

    df2 = spark_session.read\
        .option("header", "false")\
        .option("delimiter", "\t")\
        .csv('hdfs://hdfspath/v0/dt=%s/*' % twoday, usercat_structure)

    df1.show()
    df2.show()

    df1.createOrReplaceTempView("dpi")
    df2.createOrReplaceTempView("usercat")

    finaldf = spark_session.sql('''
    SQL QUERY
    ''')

    finaldf.coalesce(10).write.format("com.databricks.spark.csv").option("header", "true").option('sep', '\t').mode('append').save('hdfs://hdfs path')
Read it as StringType and then convert it to int:
df.withColumn("field3", df.field3.cast("int"))
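Expanding slightly on that idea (a sketch, not from the original answer): when field3 is declared as StringType in the schema, casting it to int turns any non-numeric value such as '-' into null, which you can then replace with a default if you prefer:
import pyspark.sql.functions as F

df1_clean = (
    df1.withColumn("field3", F.col("field3").cast("int"))  # '-' and other non-numeric values become null
       .fillna({"field3": 0})  # optional: replace the resulting nulls with a default value (assumption)
)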
