I am trying to show the maximum value of a column while grouping rows by a date column.
So I tried this code (with max imported from pyspark.sql.functions):
maxVal = dfSelect.select('*')\
.groupBy('DATE')\
.agg(max('CLOSE'))
But the output looks like this:
+----------+----------+
| DATE|max(CLOSE)|
+----------+----------+
|1987-05-08| 43.51|
|1987-05-29| 39.061|
+----------+----------+
I want the output to look like this:
+------+---+----------+------+------+------+------+------+---+----------+
|TICKER|PER| DATE| TIME| OPEN| HIGH| LOW| CLOSE|VOL|max(CLOSE)|
+------+---+----------+------+------+------+------+------+---+----------+
| CDG| D|1987-01-02|000000|50.666|51.441|49.896|50.666| 0| 50.666|
| ABC| D|1987-01-05|000000|51.441| 52.02|51.441|51.441| 0| 51.441|
+------+---+----------+------+------+------+------+------+---+----------+
So my question is: how do I change the code so that the output contains all columns plus the aggregated 'CLOSE' column?
The schema of my data looks like this:
root
|-- TICKER: string (nullable = true)
|-- PER: string (nullable = true)
|-- DATE: date (nullable = true)
|-- TIME: string (nullable = true)
|-- OPEN: float (nullable = true)
|-- HIGH: float (nullable = true)
|-- LOW: float (nullable = true)
|-- CLOSE: float (nullable = true)
|-- VOL: integer (nullable = true)
|-- OPENINT: string (nullable = true)
If you want the same aggregation for all the columns of the original dataframe, then you can do something like this:
import pyspark.sql.functions as F
expr = [F.max(coln).alias(coln) for coln in df.columns if 'date' not in coln]  # df is your dataframe
df_res = df.groupby('date').agg(*expr)
If you want multiple aggregations, then you can do something like this:
sub_col1 = [...]  # define: list of columns to aggregate with max
sub_col2 = [...]  # define: list of columns to aggregate with first
expr1 = [F.max(coln).alias(coln) for coln in sub_col1 if 'date' not in coln]
expr2 = [F.first(coln).alias(coln) for coln in sub_col2 if 'date' not in coln]
expr=expr1+expr2
df_res = df.groupby('date').agg(*expr)
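For instance, here is a minimal self-contained sketch of this pattern, with an invented two-row sample shaped like the asker's data and an illustrative split into sub_col1/sub_col2:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('multi-agg-sketch').getOrCreate()
df = spark.createDataFrame(
    [('CDG', '1987-01-02', 50.666, 50.666),
     ('ABC', '1987-01-02', 51.441, 51.441)],
    ['TICKER', 'DATE', 'OPEN', 'CLOSE'])

sub_col1 = ['OPEN', 'CLOSE']  # columns aggregated with max
sub_col2 = ['TICKER']         # columns kept via first (non-deterministic without ordering)

expr1 = [F.max(c).alias(c) for c in sub_col1]
expr2 = [F.first(c).alias(c) for c in sub_col2]
df.groupby('DATE').agg(*(expr1 + expr2)).show()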
If you want only one of the columns aggregated and added back to your original dataframe, then you can do a self-join after aggregating:
df_agg = df.groupby('date').agg(F.max('close').alias('close_agg')) \
           .withColumn("dummy", F.lit("dummy"))  # the dummy column is a workaround for Spark self-join issues
df_join = df.join(df_agg,on='date',how='left')
Or you can use a window function:
from pyspark.sql import Window
w = Window.partitionBy('date')
df_res = df.withColumn("max_close",F.max('close').over(w))
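Below is a minimal end-to-end sketch of this window approach, which most directly produces the asker's desired shape (all original columns plus the per-date maximum); the sample rows and the reduced column set are invented for brevity:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.appName('window-sketch').getOrCreate()
dfSelect = spark.createDataFrame(
    [('CDG', '1987-01-02', 50.666),
     ('ABC', '1987-01-02', 51.441),
     ('CDG', '1987-01-05', 43.51)],
    ['TICKER', 'DATE', 'CLOSE'])

w = Window.partitionBy('DATE')
dfSelect.withColumn('max_close', F.max('CLOSE').over(w)).show()
# Every row keeps its original columns and gains the maximum CLOSE for its DATE,
# e.g. both 1987-01-02 rows get max_close = 51.441 (row order in show() may vary).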
I have a list of the following type:
[(key1, [(key11, value11), (key12, value12)]), (key2, [(key21, value21), (key22, value22)...])...]
A sample structure is shown below:
[('1052762305',
[('1007819788', 0.9206884810054885),
('1005886801', 0.913818268123084),
('1003863766', 0.9131746152849486),
('1007811435', 0.9128666156173751),
('1005879599', 0.9126368405937075),
('1003705572', 0.9122051062936369),
('1007804896', 0.9083424459788203),
('1005890270', 0.8982097535650703),
('1007806781', 0.8708761186829758),
('1003670458', 0.8452789033694487)]),
('1064808607',
[('1007804896', 0.9984397647563017),
('1003705572', 0.9970498347406341),
('1005879599', 0.9951581013190172),
('1007811435', 0.9934813787902085),
('1005886801', 0.9930572794622374),
('1003863766', 0.9928815742735568),
('1007819788', 0.9869723713790797),
('1005890270', 0.9642640856016443),
('1007806781', 0.9211558765137313),
('1003670458', 0.8519872445941068)])]
I want to convert this into a dataframe of the form
key1 key2 score
1052762305 1007819788 0.9206884810054885
1052762305 1005886801 0.913818268123084
1052762305 1003863766 0.9131746152849486
... ... ...
1064808607 1007804896 0.9984397647563017
1064808607 1003705572 0.9970498347406341
1064808607 1005879599 0.9951581013190172
... ... ...
How can we implement this in PySpark?
You can create a schema upfront for the input, then use explode and access the elements within the value struct.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField,StringType,ArrayType, DoubleType
spark = SparkSession.builder \
    .appName('SO') \
    .getOrCreate()

schema = StructType([
    StructField("key1", StringType()),
    StructField("value", ArrayType(
        StructType([
            StructField("key2", StringType()),
            StructField("score", DoubleType())
        ])
    ))
])
df = spark.createDataFrame(
[('1052762305',
[('1007819788', 0.9206884810054885),
('1005886801', 0.913818268123084),
('1003863766', 0.9131746152849486),
('1007811435', 0.9128666156173751),
('1005879599', 0.9126368405937075),
('1003705572', 0.9122051062936369),
('1007804896', 0.9083424459788203),
('1005890270', 0.8982097535650703),
('1007806781', 0.8708761186829758),
('1003670458', 0.8452789033694487)]),
('1064808607',
[('1007804896', 0.9984397647563017),
('1003705572', 0.9970498347406341),
('1005879599', 0.9951581013190172),
('1007811435', 0.9934813787902085),
('1005886801', 0.9930572794622374),
('1003863766', 0.9928815742735568),
('1007819788', 0.9869723713790797),
('1005890270', 0.9642640856016443),
('1007806781', 0.9211558765137313),
('1003670458', 0.8519872445941068)])
],schema
)
df.show()
+----------+--------------------+
| key1| value |
+----------+--------------------+
|1052762305|[[1007819788, 0.9...|
|1064808607|[[1007804896, 0.9...|
+----------+--------------------+
df.printSchema()
root
|-- key1: string (nullable = true)
|-- value: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key2: string (nullable = true)
| | |-- score: double (nullable = true)
df1 = df.select('key1', F.explode('value').alias('value'))
df1.show()
+----------+--------------------+
| key1| value |
+----------+--------------------+
|1052762305|[1007819788, 0.92...|
|1052762305|[1005886801, 0.91...|
|1052762305|[1003863766, 0.91...|
|1052762305|[1007811435, 0.91...|
|1052762305|[1005879599, 0.91...|
|1052762305|[1003705572, 0.91...|
|1052762305|[1007804896, 0.90...|
|1052762305|[1005890270, 0.89...|
|1052762305|[1007806781, 0.87...|
|1052762305|[1003670458, 0.84...|
|1064808607|[1007804896, 0.99...|
|1064808607|[1003705572, 0.99...|
|1064808607|[1005879599, 0.99...|
|1064808607|[1007811435, 0.99...|
|1064808607|[1005886801, 0.99...|
|1064808607|[1003863766, 0.99...|
|1064808607|[1007819788, 0.98...|
|1064808607|[1005890270, 0.96...|
|1064808607|[1007806781, 0.92...|
|1064808607|[1003670458, 0.85...|
+----------+--------------------+
df1.printSchema()
root
|-- key1: string (nullable = true)
|-- value: struct (nullable = true)
| |-- key2: string (nullable = true)
| |-- score: double (nullable = true)
df1.select("key1", "value.key2","value.score").show()
+----------+----------+------------------+
| key1| key2| score|
+----------+----------+------------------+
|1052762305|1007819788|0.9206884810054885|
|1052762305|1005886801| 0.913818268123084|
|1052762305|1003863766|0.9131746152849486|
|1052762305|1007811435|0.9128666156173751|
|1052762305|1005879599|0.9126368405937075|
|1052762305|1003705572|0.9122051062936369|
|1052762305|1007804896|0.9083424459788203|
|1052762305|1005890270|0.8982097535650703|
|1052762305|1007806781|0.8708761186829758|
|1052762305|1003670458|0.8452789033694487|
|1064808607|1007804896|0.9984397647563017|
|1064808607|1003705572|0.9970498347406341|
|1064808607|1005879599|0.9951581013190172|
|1064808607|1007811435|0.9934813787902085|
|1064808607|1005886801|0.9930572794622374|
|1064808607|1003863766|0.9928815742735568|
|1064808607|1007819788|0.9869723713790797|
|1064808607|1005890270|0.9642640856016443|
|1064808607|1007806781|0.9211558765137313|
|1064808607|1003670458|0.8519872445941068|
+----------+----------+------------------+
You basically need to do the following:
create a dataframe from your list
promote the pairs from the array elements into separate rows by using explode
extract the key and value from each pair via select
This can be done with something like this (the source data is in a variable called a):
from pyspark.sql.functions import explode, col
df = spark.createDataFrame(a, ['key1', 'val'])
df2 = df.select(col('key1'), explode(col('val')).alias('val'))
df3 = df2.select('key1', col('val')._1.alias('key2'), col('val')._2.alias('value'))
We can check that the schema and data match:
>>> df3.printSchema()
root
|-- key1: string (nullable = true)
|-- key2: string (nullable = true)
|-- value: double (nullable = true)
>>> df3.show(2)
+----------+----------+------------------+
| key1| key2| value|
+----------+----------+------------------+
|1052762305|1007819788|0.9206884810054885|
|1052762305|1005886801| 0.913818268123084|
+----------+----------+------------------+
only showing top 2 rows
We can also check the schemas of the intermediate results:
>>> df.printSchema()
root
|-- key1: string (nullable = true)
|-- val: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _1: string (nullable = true)
| | |-- _2: double (nullable = true)
>>> df2.printSchema()
root
|-- key1: string (nullable = true)
|-- val: struct (nullable = true)
| |-- _1: string (nullable = true)
| |-- _2: double (nullable = true)
I want to remove barcode from the variants array.
My dataframe schema looks like the sample given below:
|-- variants: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- admin_graphql_api_id: string (nullable = true)
| | |-- barcode: string (nullable = true)
| | |-- compare_at_price: string (nullable = true)
Can you help me remove this element from the dataframe using PySpark?
You can use arrays_zip:
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField

# target type for variants: the element struct without the barcode field
schema = ArrayType(StructType([StructField("admin_graphql_api_id", StringType()),
                               StructField("compare_at_price", StringType())]))

df = df.withColumn("variants", F.arrays_zip("variants.admin_graphql_api_id", "variants.compare_at_price"))
df = df.withColumn("variants", F.col("variants").cast(schema))
df.printSchema()
prints
root
|-- variants: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- admin_graphql_api_id: string (nullable = true)
| | |-- compare_at_price: string (nullable = true)
The second withColumn is necessary to set the field names of the new struct.
arrays_zip is only available for Spark versions >= 2.4.0. If you are using an older Spark version, you could use a UDF instead:
def func(array):
    return [[x.admin_graphql_api_id, x.compare_at_price] for x in array]

func_udf = F.udf(func, schema)
df = df.withColumn("variants", func_udf("variants"))
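For completeness, here is a self-contained sketch of the arrays_zip approach on a one-row sample that is invented to match the question's schema (only three of the struct fields are used):
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, StructType, StructField

spark = SparkSession.builder.appName('drop-barcode-sketch').getOrCreate()

variant = StructType([StructField("admin_graphql_api_id", StringType()),
                      StructField("barcode", StringType()),
                      StructField("compare_at_price", StringType())])
df = spark.createDataFrame(
    [([("id-1", "123456789", "9.99")],)],
    StructType([StructField("variants", ArrayType(variant))]))

schema = ArrayType(StructType([StructField("admin_graphql_api_id", StringType()),
                               StructField("compare_at_price", StringType())]))
df = df.withColumn("variants", F.arrays_zip("variants.admin_graphql_api_id", "variants.compare_at_price"))
df = df.withColumn("variants", F.col("variants").cast(schema))
df.printSchema()  # the element struct now has only admin_graphql_api_id and compare_at_price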
I want to create a dummy dataframe with one row which has Decimal values in it. But when I do so, it automatically converts them to doubles. I want the data type to be Decimal(18,2) or similar.
dummy_row = spark.createDataFrame([(0, -1, 'missing', 'missing', 0.0)], df.columns)
I want the schema to be
unique_id:integer
line_id:long
line_name:string
line_type:string
pct:decimal(18,5)
But I get
unique_id:integer
line_id:long
line_name:string
line_type:string
pct:double
How do I cast the double to decimal type in PySpark?
There are two ways.
My preferred way is to cast to the decimal type after the dataframe is created.
It is safest to provide the values as strings.
import pyspark.sql.functions as F

df = spark.createDataFrame([("92.34567890123456789", )], ["col_1"])
df = df.withColumn("col_1", F.col("col_1").cast("decimal(30,20)"))
df.show(truncate=False)
df.printSchema()
# +-----------------------+
# |col_1 |
# +-----------------------+
# |92.34567890123456789000|
# +-----------------------+
#
# root
# |-- col_1: decimal(30,20) (nullable = true)
If the values were provided as numbers instead, Python may "truncate" them, because it would first create double-precision floating-point numbers (16-17 significant digits) out of what was written in the code. Note how the last digits 789 disappear:
df = spark.createDataFrame([(92.34567890123456789, )], ["col_1"])
df = df.withColumn("col_2", F.col("col_1").cast("decimal(30,20)"))
df.show(truncate=False)
df.printSchema()
# +-----------------+-----------------------+
# |col_1 |col_2 |
# +-----------------+-----------------------+
# |92.34567890123456|92.34567890123456000000|
# +-----------------+-----------------------+
#
# root
# |-- col_1: double (nullable = true)
# |-- col_2: decimal(30,20) (nullable = true)
The other way is to create decimal numbers beforehand.
from decimal import Context
dec_number = Context(prec=38).create_decimal('92.34567890123456789')
df = spark.createDataFrame([(dec_number, )], 'col_1 decimal(30,20)')
df.show(truncate=False)
df.printSchema()
# +-----------------------+
# |col_1 |
# +-----------------------+
# |92.34567890123456789000|
# +-----------------------+
#
# root
# |-- col_1: decimal(30,20) (nullable = true)
38 is the maximum decimal precision which is supported in Spark.
So, in your case you could:
1.1. - create dataframe from scratch - casting to decimal after df is created
dummy_row = spark.createDataFrame(
[(0, -1, 'missing', 'missing', '0.0')],
"unique_id:int, line_id:long, line_name:string, line_type:string, pct:string")
dummy_row = dummy_row.withColumn("pct", F.col("pct").cast("decimal(18,5)"))
dummy_row.show(truncate=False)
dummy_row.printSchema()
# +---------+-------+---------+---------+-------+
# |unique_id|line_id|line_name|line_type|pct |
# +---------+-------+---------+---------+-------+
# |0 |-1 |missing |missing |0.00000|
# +---------+-------+---------+---------+-------+
#
# root
# |-- unique_id: integer (nullable = true)
# |-- line_id: long (nullable = true)
# |-- line_name: string (nullable = true)
# |-- line_type: string (nullable = true)
# |-- pct: decimal(18,5) (nullable = true)
1.2. - create dataframe from scratch - using Python's decimal module
from decimal import Context
dec_number = Context(prec=38).create_decimal('0.0')
dummy_row = spark.createDataFrame(
[(0, -1, 'missing', 'missing', dec_number)],
"unique_id:int, line_id:long, line_name:string, line_type:string, pct:decimal(18,5)")
dummy_row.show(truncate=False)
dummy_row.printSchema()
# +---------+-------+---------+---------+-------+
# |unique_id|line_id|line_name|line_type|pct |
# +---------+-------+---------+---------+-------+
# |0 |-1 |missing |missing |0.00000|
# +---------+-------+---------+---------+-------+
#
# root
# |-- unique_id: integer (nullable = true)
# |-- line_id: long (nullable = true)
# |-- line_name: string (nullable = true)
# |-- line_type: string (nullable = true)
# |-- pct: decimal(18,5) (nullable = true)
2.1. - borrow schema from existing df (it seems you originally wanted this) - casting to decimal after df is created
df = spark.createDataFrame(
[],
"unique_id:int, line_id:long, line_name:string, line_type:string, pct:decimal(18,5)")
dummy_row = spark.createDataFrame([(0, -1, 'missing', 'missing', '0.0')], df.columns)
dummy_row = dummy_row.withColumn("pct", F.col("pct").cast("decimal(18,5)"))
dummy_row.show(truncate=False)
dummy_row.printSchema()
# +---------+-------+---------+---------+-------+
# |unique_id|line_id|line_name|line_type|pct |
# +---------+-------+---------+---------+-------+
# |0 |-1 |missing |missing |0.00000|
# +---------+-------+---------+---------+-------+
#
# root
# |-- unique_id: long (nullable = true)
# |-- line_id: long (nullable = true)
# |-- line_name: string (nullable = true)
# |-- line_type: string (nullable = true)
# |-- pct: decimal(18,5) (nullable = true)
2.2. - borrow schema from existing df (it seems you originally wanted this) - using Python's decimal module
from decimal import Context
df = spark.createDataFrame(
[],
"unique_id:int, line_id:long, line_name:string, line_type:string, pct:decimal(18,5)")
dec_number = Context(prec=38).create_decimal('0.0')
dummy_row = spark.createDataFrame([(0, -1, 'missing', 'missing', dec_number)], df.schema)
dummy_row.show(truncate=False)
dummy_row.printSchema()
# +---------+-------+---------+---------+-------+
# |unique_id|line_id|line_name|line_type|pct |
# +---------+-------+---------+---------+-------+
# |0 |-1 |missing |missing |0.00000|
# +---------+-------+---------+---------+-------+
#
# root
# |-- unique_id: integer (nullable = true)
# |-- line_id: long (nullable = true)
# |-- line_name: string (nullable = true)
# |-- line_type: string (nullable = true)
# |-- pct: decimal(18,5) (nullable = true)