I have two spark dataframes:
df1 = sc.parallelize([
    ['a', '1', 'value1'],
    ['b', '1', 'value2'],
    ['c', '2', 'value3'],
    ['d', '4', 'value4'],
    ['e', '2', 'value5'],
    ['f', '4', 'value6']
]).toDF(('id1', 'id2', 'v1'))
df2 = sc.parallelize([
    ['a', '1', 1],
    ['b', '1', 1],
    ['y', '2', 4],
    ['z', '2', 4]
]).toDF(('id1', 'id2', 'v2'))
Each of them has the fields id1 and id2 (and they may contain many more id columns).
First, I need to match df1 with df2 by id1.
Then I need to match all the unmatched records from both dataframes by id2, and so on.
My current approach is:
def joinA(df1, df2, field):
    from pyspark.sql.functions import lit
    L = 'L_'
    R = 'R_'
    Lfield = L + field
    Rfield = R + field
    # take the field names
    df1n = df1.schema.names
    df2n = df2.schema.names
    newL = [L + fld for fld in df1n]
    newR = [R + fld for fld in df2n]
    # drop duplicates by the input field
    df1 = df1.toDF(*newL).dropDuplicates([Lfield])
    df2 = df2.toDF(*newR).dropDuplicates([Rfield])
    # match the records
    df_full = df1.join(df2, df1[Lfield] == df2[Rfield], how='outer').cache()
    # unmatched records from df1
    df_left = df_full.where(df2[Rfield].isNull()).select(newL).toDF(*df1n)
    # unmatched records from df2
    df_right = df_full.where(df1[Lfield].isNull()).select(newR).toDF(*df2n)
    # matched records, with the match level added
    df_inner = df_full.where(
        (~df1[Lfield].isNull()) & (~df2[Rfield].isNull())
    ).withColumn('matchlevel', lit(field))
    return df_left, df_inner, df_right
first_l,first_i,first_r = joinA(df1,df2,'id1')
second_l,second_i,second_r = joinA(first_l,first_r,'id2')
result = first_i.union(second_i)
Is there a way to make this simpler?
Or are there standard tools for this job?
Thank you,
Maks
I have another way to do it, but I am not sure it is better than your solution:
from pyspark.sql import functions as F

id_cols = [cols for cols in df1.columns if cols != 'v1']
df1 = df1.withColumn("get_v2", F.lit(None))
df1 = df1.withColumn("match_level", F.lit(None))

for col in id_cols:
    new_df1 = df1.join(
        df2.select(
            col,
            "v2"
        ),
        on=(
            (df1[col] == df2[col])
            & df1['get_v2'].isNull()
        ),
        how='left'
    )
    new_df1 = new_df1.withColumn(
        "get_v2",
        F.coalesce(df1.get_v2, df2.v2)
    ).drop(df2[col]).drop(df2.v2)
    new_df1 = new_df1.withColumn(
        "match_level",
        F.when(F.col("get_v2").isNotNull(), F.coalesce(F.col("match_level"), F.lit(col)))
    )
    df1 = new_df1
df1.show()
+---+---+---+------+------+-----------+
|id1|id2|id3| v1|get_v2|match_level|
+---+---+---+------+------+-----------+
| f| 4| 1|value6| 3| id3|
| d| 4| 1|value4| 3| id3|
| c| 2| 1|value3| 4| id2|
| c| 2| 1|value3| 4| id2|
| e| 2| 1|value5| 4| id2|
| e| 2| 1|value5| 4| id2|
| b| 1| 1|value2| 1| id1|
| a| 1| 1|value1| 1| id1|
+---+---+---+------+------+-----------+
This will result in N joins, where N is the number of id columns you have.
EDIT : added match_level !
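For what it's worth, the hard-coded chain of joinA calls from the question can also be driven by a loop over the id fields, so the number of match levels does not need to be spelled out by hand. A minimal sketch, reusing joinA exactly as defined in the question:
def cascade_join(df1, df2, fields):
    # repeatedly match the still-unmatched records on the next id field
    left, right = df1, df2
    matched = None
    for field in fields:
        left, inner, right = joinA(left, right, field)
        matched = inner if matched is None else matched.union(inner)
    return matched, left, right

result, unmatched_left, unmatched_right = cascade_join(df1, df2, ['id1', 'id2'])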
Related
I have data like in the dataframe below. As you can see, there are columns "2019" and "2019_p", "2020" and "2020_p", "2021" and "2021_p".
I want to select the final columns dynamically: if "2019" is null, take the value of "2019_p"; if "2020" is null, take the value of "2020_p"; and the same applies to "2021", etc.
I want to select the columns dynamically without hardcoding the column names.
How do I achieve this?
I need output like this:
You can simplify ZygD's approach to just use a list comprehension with coalesce (without regex).
from pyspark.sql import functions as func

# following list can be created from a source dataframe as well
year_cols = ['2019', '2020', '2021']
# [k for k in data_sdf.columns if k.startswith('20') and not k.endswith('_p')]
data_sdf. \
    select('id', 'type',
           *[func.coalesce(c, c+'_p').alias(c) for c in year_cols]
           ). \
    show()
# +---+----+----+----+----+
# | id|type|2019|2020|2021|
# +---+----+----+----+----+
# | 1| A| 50| 65| 40|
# | 1| B| 25| 75| 75|
# +---+----+----+----+----+
where the list comprehension would yield the following
[func.coalesce(c, c+'_p').alias(c) for c in year_cols]
# [Column<'coalesce(2019, 2019_p) AS `2019`'>,
# Column<'coalesce(2020, 2020_p) AS `2020`'>,
# Column<'coalesce(2021, 2021_p) AS `2021`'>]
Input:
from pyspark.sql import functions as F
df = spark.createDataFrame(
    [(1, 'A', 50, None, 40, None, 65, None),
     (1, 'B', None, 75, None, 25, None, 75)],
    ['Id', 'Type', '2019', '2020', '2021', '2019_p', '2020_p', '2021_p'])
One way could be this - using df.colRegex:
cols = list({c[:4] for c in df.columns if c not in ['Id', 'Type']})
df = df.select(
    'Id', 'Type',
    *[F.coalesce(*df.select(df.colRegex(f'`^{c}.*`')).columns).alias(c) for c in cols]
)
df.show()
# +---+----+----+----+----+
# | Id|Type|2020|2019|2021|
# +---+----+----+----+----+
# | 1| A| 65| 50| 40|
# | 1| B| 75| 25| 75|
# +---+----+----+----+----+
Also possible using startswith:
cols = list({c[:4] for c in df.columns if c not in ['Id', 'Type']})
df = df.select(
    'Id', 'Type',
    *[F.coalesce(*[x for x in df.columns if x.startswith(c)]).alias(c) for c in cols]
)
If you need a one-liner, create a dictionary of the columns and use the key, value pairs in the coalesce:
from pyspark.sql.functions import coalesce
df.select( 'Id','Type',*[coalesce(k,v).alias(k) for k,v in dict(zip(df.select(df.colRegex("`\\d{4}`")).columns,df.select(df.colRegex("`.*\\_\\D$`")).columns)).items()]).show()
+---+----+----+----+----+
| Id|Type|2019|2020|2021|
+---+----+----+----+----+
| 1| A| 50| 65| 40|
| 1| B| 25| 75| 75|
+---+----+----+----+----+
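Here the dict(zip(...)) pairs each year column with its _p counterpart; given the input columns above, it is expected to evaluate to:
dict(zip(df.select(df.colRegex("`\\d{4}`")).columns,
         df.select(df.colRegex("`.*\\_\\D$`")).columns))
# {'2019': '2019_p', '2020': '2020_p', '2021': '2021_p'}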
Using PySpark, I have to select whichever of two columns has more data (more non-null values) in it and keep it in my DataFrame.
For example, we have two columns A and B:
In the example, column B has more values, so I will keep it in my DF for transformations. Similarly, I would take A if A had more values. I think we can do it using if-else conditions, but I'm not able to get the correct logic.
You could first aggregate the columns (count the values in each). This way you will get just one row, which you can extract as a dictionary using .head().asDict(). Then use Python's max(your_dict, key=your_dict.get) to get the dict key having the max value (i.e. the name of the column with the most non-null values). Then just select this column.
Example input:
from pyspark.sql import functions as F
df = spark.createDataFrame([(1, 7), (2, 4), (3, 7), (None, 8), (None, 4)], ['A', 'B'])
df.show()
# +----+---+
# | A| B|
# +----+---+
# | 1| 7|
# | 2| 4|
# | 3| 7|
# |null| 8|
# |null| 4|
# +----+---+
Scalable script using built-in max:
val_cnt = df.agg(*[F.count(c).alias(c) for c in {'A', 'B'}]).head().asDict()
df = df.select(max(val_cnt, key=val_cnt.get))
df.show()
# +---+
# | B|
# +---+
# | 7|
# | 4|
# | 7|
# | 8|
# | 4|
# +---+
Script for just 2 columns (A and B):
head = df.agg(*[F.count(c).alias(c) for c in {'A', 'B'}]).head()
df = df.select('B' if head.B > head.A else 'A')
df.show()
# +---+
# | B|
# +---+
# | 7|
# | 4|
# | 7|
# | 8|
# | 4|
# +---+
Scalable script adjustable to more columns, without built-in max:
val_cnt = df.agg(*[F.count(c).alias(c) for c in {'A', 'B'}]).head().asDict()
key, val = '', -1
for k, v in val_cnt.items():
    if v > val:
        key, val = k, v
df = df.select(key)
df.show()
# +---+
# | B|
# +---+
# | 7|
# | 4|
# | 7|
# | 8|
# | 4|
# +---+
Create a data frame with the data
df = spark.createDataFrame(data=[(1,7),(2,4),(3,7),(4,8),(5,0),(6,0),(None,3),(None,5),(None,8),(None,4)],schema = ['A','B'])
Define a condition to check for that
from pyspark.sql.functions import *
import pyspark.sql.functions as fx
condition = fx.when((fx.col('A').isNotNull() & (fx.col('A')>fx.col('B'))),fx.col('A')).otherwise(fx.col('B'))
df_1 = df.withColumn('max_value_among_A_and_B',condition)
Print the dataframe
df_1.show()
or
If you want to pick up the whole column just based on the count, you can try this:
from pyspark.sql.functions import *
import pyspark.sql.functions as fx
df = spark.createDataFrame(data=[(1,7),(2,4),(3,7),(4,8),(5,0),(6,0),(None,3),(None,5),(None,8),(None,4)],schema = ['A','B'])
# count non-null values in each column (df.select('A').count() would count all rows, nulls included)
if df.where(fx.col('A').isNotNull()).count() > df.where(fx.col('B').isNotNull()).count():
    pickcolumn = 'A'
else:
    pickcolumn = 'B'
df_1 = df.withColumn('NewColumn', col(pickcolumn)).drop('A', 'B')
df_1.show()
This is my dataframe. I'm trying to drop the duplicate columns with the same name, using their index:
df = spark.createDataFrame([(1,2,3,4,5)],['c','b','a','a','b'])
df.show()
Output:
+---+---+---+---+---+
| c| b| a| a| b|
+---+---+---+---+---+
| 1| 2| 3| 4| 5|
+---+---+---+---+---+
I got the column indices of the dataframe:
col_dict = {x: col for x, col in enumerate(df.columns)}
col_dict
Output:
{0: 'c', 1: 'b', 2: 'a', 3: 'a', 4: 'b'}
Now I need to drop the duplicate columns with the same name.
There is no method for dropping columns by index. One way to achieve this is to rename the duplicate columns and then drop them.
Here is an example you can adapt:
df_cols = df.columns
# get index of the duplicate columns
duplicate_col_index = list(set([df_cols.index(c) for c in df_cols if df_cols.count(c) == 2]))
# rename by adding suffix '_duplicated'
for i in duplicate_col_index:
    df_cols[i] = df_cols[i] + '_duplicated'
# rename the column in DF
df = df.toDF(*df_cols)
# remove flagged columns
cols_to_remove = [c for c in df_cols if '_duplicated' in c]
df.drop(*cols_to_remove).show()
+---+---+---+
| c| a| b|
+---+---+---+
| 1| 4| 5|
+---+---+---+
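If you instead want to keep the first occurrence of each duplicated name and drop the later ones, the same rename-then-drop idea works by flagging every occurrence after the first. A small variant of the snippet above:
df_cols = df.columns
seen = set()
new_cols = []
for c in df_cols:
    if c in seen:
        # later occurrence of an already-seen name: flag it for removal
        new_cols.append(c + '_duplicated')
    else:
        seen.add(c)
        new_cols.append(c)
df = df.toDF(*new_cols)
df.drop(*[c for c in new_cols if c.endswith('_duplicated')]).show()
# keeps the first occurrences: c=1, b=2, a=3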
I want to generate a when clause based on values in a dict. It's very similar to what's being done in How do I use multiple conditions with pyspark.sql.functions.when()?
Only I want to pass a dict of columns and values.
Let's say I have a dict:
{
'employed': 'Y',
'athlete': 'N'
}
I want to use that dict to generate the equivalent of:
df.withColumn("call_person",when((col("employed") == "Y") & (col("athlete") == "N"), "Y")
So the end result is:
+---+-----------+--------+-------+
| id|call_person|employed|athlete|
+---+-----------+--------+-------+
|  1|          Y|       Y|      N|
|  2|          N|       Y|      Y|
|  3|          N|       N|      N|
+---+-----------+--------+-------+
Note that part of the reason I want to do it programmatically is that I have dicts of different lengths (i.e. different numbers of conditions).
Use the reduce() function:
from functools import reduce
from pyspark.sql.functions import when, col
# dictionary
d = {
'employed': 'Y',
'athlete': 'N'
}
# set up the conditions, multiple conditions merged with `&`
cond = reduce(lambda x,y: x&y, [ col(c) == v for c,v in d.items() if c in df.columns ])
# set up the new column
df.withColumn("call_person", when(cond, "Y").otherwise("N")).show()
+---+--------+-------+-----------+
| id|employed|athlete|call_person|
+---+--------+-------+-----------+
| 1| Y| N| Y|
| 2| Y| Y| N|
| 3| N| N| N|
+---+--------+-------+-----------+
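Since the conditions are built from d.items(), the same code handles dicts of any length with no changes. A minimal sketch with a hypothetical third key ('retired' is an assumed column name; the `if c in df.columns` guard simply drops the condition if that column does not exist):
# hypothetical dict with a third condition
d3 = {
    'employed': 'Y',
    'athlete': 'N',
    'retired': 'N'
}
cond3 = reduce(lambda x, y: x & y, [col(c) == v for c, v in d3.items() if c in df.columns])
df.withColumn("call_person", when(cond3, "Y").otherwise("N")).show()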
You can also access dictionary items directly:
my_dict = {
    'code': 'b',
    'amt': '4'
}
data = [(1, 'code'), (1, 'amt')]
df = spark.createDataFrame(data, ['id', 'dict_key'])

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

user_func = udf(lambda x: my_dict.get(x), StringType())
newdf = df.withColumn('new_column', user_func(df.dict_key))
newdf.show()
+---+--------+----------+
| id|dict_key|new_column|
+---+--------+----------+
| 1| code| b|
| 1| amt| 4|
+---+--------+----------+
Or by broadcasting the dictionary:
broadcast_dict = sc.broadcast(my_dict)

def my_func(key):
    return broadcast_dict.value.get(key)

new_my_func = udf(my_func, StringType())
newdf = df.withColumn('new_column', new_my_func(df.dict_key))
newdf.show()
+---+--------+----------+
| id|dict_key|new_column|
+---+--------+----------+
| 1| code| b|
| 1| amt| 4|
+---+--------+----------+
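As an alternative to the UDF approaches above, the same lookup can be expressed with a literal map column via create_map, which avoids a UDF entirely. A minimal sketch, assuming the my_dict and df defined above:
from itertools import chain
from pyspark.sql.functions import create_map, lit

# build a map<string,string> literal column from the Python dict
mapping = create_map(*[lit(x) for x in chain(*my_dict.items())])
newdf = df.withColumn('new_column', mapping[df.dict_key])
newdf.show()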
df1 has fields id and json; df2 has fields id and json.
df1.count() => 1200; df2.count() => 20
df1 has all the rows. df2 has an incremental update with just 20 rows.
My goal is to update df1 with the values from df2. All the ids of df2 are in df1, but df2 has updated values (in the json field) for those same ids.
The resulting df should have all the values from df1 and the updated values from df2.
What is the best way to do this, with the least number of joins and filters?
Thanks!
You can achieve this using one left join.
Create Example DataFrames
Using the sample data provided by @Shankar Koirala in his answer.
data1 = [
    (1, "a"),
    (2, "b"),
    (3, "c")
]
df1 = sqlCtx.createDataFrame(data1, ["id", "value"])

data2 = [
    (1, "x"),
    (2, "y")
]
df2 = sqlCtx.createDataFrame(data2, ["id", "value"])
Do a left join
Join the two DataFrames using a left join on the id column. This will keep all of the rows in the left DataFrame. For the rows in the right DataFrame that don't have a matching id, the value will be null.
import pyspark.sql.functions as f
df1.alias('l').join(df2.alias('r'), on='id', how='left')\
    .select(
        'id',
        f.col('l.value').alias('left_value'),
        f.col('r.value').alias('right_value')
    )\
    .show()
#+---+----------+-----------+
#| id|left_value|right_value|
#+---+----------+-----------+
#| 1| a| x|
#| 3| c| null|
#| 2| b| y|
#+---+----------+-----------+
Select the desired data
We will use the fact that the unmatched ids have a null to select the final columns. Use pyspark.sql.functions.when() to use the right value if it is not null, otherwise keep the left value.
df1.alias('l').join(df2.alias('r'), on='id', how='left')\
    .select(
        'id',
        f.when(
            ~f.isnull(f.col('r.value')),
            f.col('r.value')
        ).otherwise(f.col('l.value')).alias('value')
    )\
    .show()
#+---+-----+
#| id|value|
#+---+-----+
#| 1| x|
#| 3| c|
#| 2| y|
#+---+-----+
You can sort this output if you want the ids in order.
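For example, a minimal sketch that appends an orderBy before the show (written here with coalesce, which is equivalent to the when/isnull pair above):
df1.alias('l').join(df2.alias('r'), on='id', how='left')\
    .select('id', f.coalesce(f.col('r.value'), f.col('l.value')).alias('value'))\
    .orderBy('id')\
    .show()
#+---+-----+
#| id|value|
#+---+-----+
#|  1|    x|
#|  2|    y|
#|  3|    c|
#+---+-----+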
Using pyspark-sql
You can do the same thing using a pyspark-sql query:
df1.registerTempTable('df1')
df2.registerTempTable('df2')
query = """SELECT l.id,
CASE WHEN r.value IS NOT NULL THEN r.value ELSE l.value END AS value
FROM df1 l LEFT JOIN df2 r ON l.id = r.id"""
sqlCtx.sql(query.replace("\n", "")).show()
#+---+-----+
#| id|value|
#+---+-----+
#| 1| x|
#| 3| c|
#| 2| y|
#+---+-----+
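A side note on the API used here: registerTempTable has been deprecated since Spark 2.0 in favour of createOrReplaceTempView, so on newer versions the registration step would look like this:
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')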
I would like to provide a slightly more general solution. What happens if the input data has 100 columns instead of 2? We would spend too much time coalescing those 100 columns just to keep the values from the right side of the left join.
Another way to solve this problem would be to "delete" the updated rows from the original df and finally make a union with the updated rows.
data_original = spark.createDataFrame([
    (1, "a"),
    (2, "b"),
    (3, "c")
], ("id", "value"))
data_updated = spark.createDataFrame([
    (1, "x"),
    (2, "y")
], ("id", "value"))
data_original.show()
+---+-----+
| id|value|
+---+-----+
| 1| a|
| 2| b|
| 3| c|
+---+-----+
data_updated.show()
+---+-----+
| id|value|
+---+-----+
| 1| x|
| 2| y|
+---+-----+
data_original.createOrReplaceTempView("data_original")
data_updated.createOrReplaceTempView("data_updated")
src_data_except_updated = spark.sql("SELECT * FROM data_original WHERE id not in (1,2)")
result_data = src_data_except_updated.union(data_updated)
result_data.show()
+---+-----+
| id|value|
+---+-----+
| 3| c|
| 1| x|
| 2| y|
+---+-----+
Notice that the query
SELECT * FROM data_original WHERE id not in (1,2)
could be generated automatically:
ids_collect = spark.sql(f"SELECT id FROM data_updated").collect()
ids_list = [f"{x.id}" for x in ids_collect]
ids_str = ",".join(ids_list)
query_get_all_except = f"SELECT * FROM data_original WHERE id not in ({ids_str})"