How to merge two dataframes and return data from another column in new column only if there is match? - python-3.x

I have two dfs that look like this:
df1:
id
1
2
df2:
id value
2 a
3 b
How do I merge these two dataframes and only return the data from value column in a new column if there is a match?
new_merged_df
id value new_value
1
2 a a
3 b

You can try this using @JJFord3's setup:
import pandas
df1 = pandas.DataFrame(index=[1,2])
df2 = pandas.DataFrame({'value' : ['a','b']},index=[2,3])
#Use isin to create new_value
df2['new_value'] = df2['value'].where(df2.index.isin(df1.index))
#Use reindex with union to rebuild dataframe with both indexes
df2.reindex(df1.index.union(df2.index))
Output:
value new_value
1 NaN NaN
2 a a
3 b NaN
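If you want id back as a regular column, as in the question's new_merged_df, a small follow-up sketch (naming the index 'id' is my assumption, since the example frames only carry it as the index):
new_merged_df = (df2.reindex(df1.index.union(df2.index))
                    .rename_axis('id')
                    .reset_index())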

import pandas
df1 = pandas.DataFrame(index=[1,2])
df2 = pandas.DataFrame({'value' : ['a','b']},index=[2,3])
new_merged_df_outer = df1.merge(df2,how='outer',left_index=True,right_index=True)
new_merged_df_inner = df1.merge(df2,how='inner',left_index=True,right_index=True)
new_merged_df_inner = new_merged_df_inner.rename(columns={'value':'new_value'})
new_merged_df = new_merged_df_outer.merge(new_merged_df_inner,how='left',left_index=True,right_index=True)
First, create an outer merge to keep all indexes.
Then create an inner merge to only get the overlap.
Then merge the inner merge back to the outer merge to get the desired column setup.
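Putting the three steps together on the toy data, the final frame should come out looking roughly like this (ids 1 and 3 have no overlap, so new_value stays NaN):
print(new_merged_df)
#   value new_value
# 1   NaN       NaN
# 2     a         a
# 3     b       NaN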

You can use a full outer join.
Let's model your data with case classes:
case class MyClass1(id: String)
case class MyClass2(id: String, value: String)
// this one for the result type
case class MyClass3(id: String, value: Option[String] = None, value2: Option[String] = None)
Creating some inputs:
val input1: Dataset[MyClass1] = ...
val input2: Dataset[MyClass2] = ...
Joining your data:
import spark.implicits._ // spark being your SparkSession
val joined = input1.as("1").joinWith(input2.as("2"), $"1.id" === $"2.id", "full_outer")
joined map {
case (left, null) if left != null => MyClass3(left.id)
case (null, right) if right != null => MyClass3(right.id, Some(right.value))
case (left, right) => MyClass3(left.id, Some(right.value), Some(right.value))
}

DataFrame.merge has an indicator parameter which:
If True, adds a column to output DataFrame called “_merge” with information on the source of each row.
This can be used to check whether there is a match.
import pandas as pd
df1 = pd.DataFrame(index=[1,2])
df2 = pd.DataFrame({'value' : ['a','b']},index=[2,3])
# creates a new column `_merge` with values `right_only`, `left_only` or `both`
merged = df1.merge(df2, how='outer', right_index=True, left_index=True, indicator=True)
merged['new_value'] = merged.loc[(merged['_merge'] == 'both'), 'value']
merged = merged.drop('_merge', axis=1)
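The last two steps can also be written as a single line, using the indicator mask directly (a sketch; same result, just without keeping _merge around):
merged['new_value'] = merged['value'].where(merged.pop('_merge') == 'both')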

Use merge and isin:
import numpy as np  # only needed for the np.where alternative below
df = df1.merge(df2,on='id',how='outer')
id_value = df2.loc[df2['id'].isin(df1.id.tolist()),'id'].unique()
mask = df['id'].isin(id_value)
df.loc[mask,'new_value'] = df.loc[mask,'value']
# alternative df['new_value'] = np.where(mask, df['value'], np.nan)
print(df)
id value new_value
0 1 NaN NaN
1 2 a a
2 3 b NaN

Related

How to traverse a pandas dataframe to form a nested JSON?

I have a pandas dataframe with the following structure. It can be created using the following code:
import pandas as pd
import numpy as np
word = ['this','is','a','test','call','this','is','a','test','call','this','is ','a','test','call', np.NaN]
level_3_start = [np.NaN,np.NaN,'<tyre>','<steering>',np.NaN,np.NaN,np.NaN,np.NaN,'<leg>',np.NaN,'<clutch>',np.NaN,np.NaN,'<break>',np.NaN]
level_3_end = [np.NaN,np.NaN,'</tyre>',np.NaN,'</steering>',np.NaN,np.NaN,np.NaN,'</leg>',np.NaN,np.NaN,np.NaN,'</clutch>','</break>',np.NaN]
level_2_start = [np.NaN,np.NaN,'<car>',np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'<dog>',np.NaN,'<car>',np.NaN,np.NaN,'<bus>',np.NaN]
level_2_end = [np.NaN,np.NaN,np.NaN,np.NaN,'</car>',np.NaN,np.NaN,np.NaN,'</dog>',np.NaN,np.NaN,np.NaN,'</car>','</bus>',np.NaN]
level_1_start= [np.NaN,np.NaN,'<vehicle>',np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'<animal>',np.NaN,'<vehicle>',np.NaN,np.NaN,np.NaN,np.NaN]
level_1_end= [np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'</vehicle>',np.NaN,'</animal>',np.NaN,np.NaN,np.NaN,np.NaN,'</vehicle>',np.NaN]
df1 = pd.DataFrame(list(zip(word, level_3_start,level_3_end, level_2_start,level_2_end, level_1_start, level_1_end )),
columns =['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'])
I want to traverse the dataframe to build a nested JSON. The output should look like the one below:
{
"vehicle": {
"car":{
"tyre": True,
"steering": True
"clutch": True
},
"bus":{
"break": True
}
},
"animal": {
"dog":{
"leg": True
}
}
}
What is the best way to achieve this in pandas?
You are capturing more information than required; the end columns are not needed.
Remove rows that have nothing in them with dropna().
Forward fill the tags and strip < and > from the strings.
Use a comprehension to build the dictionary from the dataframe's to_dict() records.
df = pd.DataFrame({"word":["this","is","a","test","call","this","is","a","test","call","this","is","a","test","call"],
"level_3_start":["","","<tyre>","<steering>","","","","","<leg>","","<clutch>","","","<break>",""],
"level_3_end":["","","</tyre>","","</steering>","","","","</leg>","","","","</clutch>","</break>",""],
"level_2_start":["","","<car>","","","","","","<dog>","","<car>","","","<bus>",""],
"level_2_end":["","","","","</car>","","","","</dog>","","","","</car>","</bus>",""],
"level_1_start":["","","<vehicle>","","","","","","<animal>","","<vehicle>","","","",""],
"level_1_end":["","","","","","","</vehicle>","","</animal>","","","","","</vehicle>",""]})
# cleanup
df = df.replace({"":np.nan}).dropna(subset=[c for c in df.columns if c!="word"], how="all")
for c in [c for c in df.columns if "start" in c]:
    df[c].fillna(method="ffill", inplace=True)
    df[c] = df[c].str.replace("<","")
    df[c] = df[c].str.replace(">","")
dfd = df.loc[:,[c for c in df.columns if "level" in c]].drop_duplicates().to_dict(orient="records")
{d["level_1_start"]:
{d2["level_2_start"]:
{d3["level_3_start"]:True
for d3 in dfd if d3["level_1_start"]==d["level_1_start"] and d3["level_2_start"]==d2["level_2_start"]
}
for d2 in dfd if d2["level_1_start"]==d["level_1_start"]
}
for d in dfd
}
Output:
{'vehicle': {'car': {'tyre': True, 'steering': True, 'clutch': True},
'bus': {'break': True}},
'animal': {'dog': {'leg': True}}}
To get the final results, your data has to go through a 3 step process:
step 1: remove all columns that are not required for processing
step 2: clean data to remove tags and sort them in level_1, level_2, level_3 order
step 3: create the nested dictionary
Here's how I have done it; each section is commented to show clearly what we are doing.
import pandas as pd
import numpy as np
import collections
word = ['this','is','a','test','call','this','is','a','test','call','this','is ','a','test','call', np.NaN]
level_3_start = [np.NaN,np.NaN,'<tyre>','<steering>',np.NaN,np.NaN,np.NaN,np.NaN,'<leg>',np.NaN,'<clutch>',np.NaN,np.NaN,'<break>',np.NaN]
level_3_end = [np.NaN,np.NaN,'</tyre>',np.NaN,'</steering>',np.NaN,np.NaN,np.NaN,'</leg>',np.NaN,np.NaN,np.NaN,'</clutch>','</break>',np.NaN]
level_2_start = [np.NaN,np.NaN,'<car>',np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'<dog>',np.NaN,'<car>',np.NaN,np.NaN,'<bus>',np.NaN]
level_2_end = [np.NaN,np.NaN,np.NaN,np.NaN,'</car>',np.NaN,np.NaN,np.NaN,'</dog>',np.NaN,np.NaN,np.NaN,'</car>','</bus>',np.NaN]
level_1_start= [np.NaN,np.NaN,'<vehicle>',np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'<animal>',np.NaN,'<vehicle>',np.NaN,np.NaN,np.NaN,np.NaN]
level_1_end= [np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,np.NaN,'</vehicle>',np.NaN,'</animal>',np.NaN,np.NaN,np.NaN,np.NaN,'</vehicle>',np.NaN]
df1 = pd.DataFrame(list(zip(word, level_3_start,level_3_end, level_2_start,level_2_end, level_1_start, level_1_end )),
columns =['word', 'level_3_start', 'level_3_end', 'level_2_start', 'level_2_end', 'level_1_start', 'level_1_end'])
#creating df_temp for processing
df_temp = df1
#drop columns that are not important for this problem statement
df_temp = df_temp.drop(columns=['word','level_1_end','level_2_end','level_3_end'])
#remove all < and >
df_temp['level_1_start'] = df_temp['level_1_start'].str.replace("<","").str.replace(">","")
df_temp['level_2_start'] = df_temp['level_2_start'].str.replace("<","").str.replace(">","")
df_temp['level_3_start'] = df_temp['level_3_start'].str.replace("<","").str.replace(">","")
#drop all rows that don't have any value
df_temp.dropna(how='all', inplace = True)
#forwardfill all level_1 columns
df_temp['level_1_start'] = df_temp['level_1_start'].ffill()
#drop rows that have no data in level_2 and level_3
df_temp = df_temp.dropna(subset=['level_3_start','level_2_start'],how='all')
#forwardfill all level_2_start columns
df_temp['level_2_start'] = df_temp['level_2_start'].ffill()
#drop rows that have no data in level_3
df_temp = df_temp.dropna(subset=['level_3_start'],how='all')
#now we have all the data ready for processing
#sort them in level_1, level_2, level_3 order
df_temp = df_temp.sort_values(by=['level_1_start', 'level_2_start','level_3_start'])
#to create nested dictionary, you need to use collections.defaultdict
df_dict = collections.defaultdict(dict)
#iterate through the dataframe. each row will have a unique record for level_3
for idx,row in df_temp.iterrows():
    lev_1 = row['level_1_start']
    lev_2 = row['level_2_start']
    lev_3 = row['level_3_start']
    #if level_1 does not exist, create new entry for level_1, level_2, & level_3 (ex: animal does not exist)
    #if level_1 exists but no level_2, create new entry for level_2 & level_3 (ex: car does not exist but bus exists)
    #if level_1 and level 2 exists, then create a new entry for level 3 (ex: vehicle, car exists, but tyre does not)
    if lev_1 in df_dict:
        if lev_2 in df_dict[lev_1]:
            df_dict[lev_1][lev_2][lev_3] = True
        else:
            df_dict[lev_1][lev_2] = {lev_3:True}
    else:
        df_dict[lev_1] = {lev_2 : {lev_3:True}}
#convert collection back to normal dictionary
df_dict = dict(df_dict)
print(df_dict)
Output will be as follows:
{'animal':
{'dog': {'leg': True}
},
'vehicle':
{'bus': {'break': True},
'car': {'clutch': True, 'steering': True, 'tyre': True}
}
}
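If an actual JSON string is needed rather than a Python dict, a final json.dumps call should be enough (the True values become true in the JSON):
import json
print(json.dumps(df_dict, indent=2))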

Pandas filter through dataframe and compute stats

I'm trying to access certain categories of data and do stat computation.
A B C Type
0 1.539708 -1.166480 0.533026 foo
1 1.302092 -0.505754 0.533026 foo
2 -0.371983 1.104803 -0.651520 bar
3 -1.309622 1.118697 -1.161657 bar
4 -1.924296 0.396437 0.812436 baz
Expected output (I've left the data blank below; the actual program will have the correct output):
user_input = input('Select type: ') <-----user input foo
Mean 25% Median
A
B
C
So far I'm able to create a function to calculate the mean, 25% quantile and median for the whole dataframe using the code below:
def stat(df):
    mean = df[['A','B','C']].mean()
    quantile = df[['A','B','C']].quantile(0.25)
    median = df[['A','B','C']].median()
    df1 = mean.rename('Mean').to_frame()
    df2 = quantile.rename('25%').to_frame()
    df3 = median.rename('Median').to_frame()
    df = df1.join([df2,df3])
    return df
What I'm missing is the option to select a particular type from the Type column while still producing the same outcome as the stat function. Can anyone give a hint?
You just need to do some boolean indexing with .loc for the Type column:
user_input = input('Select type: ')
def stat(df, Type):
    mean = df.loc[(df['Type'] == Type), ['A','B','C']].mean()
    quantile = df.loc[(df['Type'] == Type), ['A','B','C']].quantile(0.25)
    median = df.loc[(df['Type'] == Type), ['A','B','C']].median()
    df1 = mean.rename('Mean').to_frame()
    df2 = quantile.rename('25%').to_frame()
    df3 = median.rename('Median').to_frame()
    df = df1.join([df2,df3])
    return df
For example, this is how it would look if you filter row-wise when user_input is foo:
stat(df, user_input)
Out[1]:
Mean 25% Median
A 1.420900 1.361496 1.420900
B -0.836117 -1.001298 -0.836117
C 0.533026 0.533026 0.533026
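As a side note, the three statistics can also be assembled without the intermediate to_frame/join steps; a compact sketch (stat_by_type is just an illustrative name):
import pandas as pd
def stat_by_type(df, type_value):
    sub = df.loc[df['Type'] == type_value, ['A', 'B', 'C']]
    return pd.DataFrame({'Mean': sub.mean(),
                         '25%': sub.quantile(0.25),
                         'Median': sub.median()})
stat_by_type(df, user_input)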

Filter dataframe based on groupby sum()

I want to filter my dataframe based on a groupby sum(). I am looking for rows where the amounts for a specific date sum to zero.
I have solved this with a for loop, but I suspect this will hurt performance if the dataframe is large.
It also seems clunky.
newdf = pd.DataFrame()
newdf['name'] = ('leon','eurika','monica','wian')
newdf['surname'] = ('swart','swart','swart','swart')
newdf['birthdate'] = ('14051981','198001','20081012','20100621')
newdf['tdate'] = ('13/05/2015','14/05/2015','15/05/2015', '13/05/2015')
newdf['tamount'] = (100.10, 111.11, 123.45, -100.10)
df = newdf.groupby(['tdate'])[['tamount']].sum().reset_index()
df2 = df.loc[df["tamount"] == 0, "tdate"]
df3 = pd.DataFrame()
for i in df2:
    df3 = df3.append(newdf.loc[newdf["tdate"] == i])
print (df3)
The code above produces the two rows whose tamount values cancel to zero when combined:
name surname birthdate tdate tamount
0 leon swart 1981-05-14 13/05/2015 100.1
3 wian swart 2010-06-21 13/05/2015 -100.1
Just use basic numpy :)
import numpy as np
df = newdf.groupby(['tdate'])[['tamount']].sum().reset_index()
dates = df['tdate'][np.where(df['tamount'] == 0)[0]]
newdf[np.isin(newdf['tdate'], dates)]
Hope this helps; let me know if you have any questions.
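A groupby/transform variant is also worth knowing; it skips the aggregated frame and the date lookup entirely (a sketch against the same newdf as above):
mask = newdf.groupby('tdate')['tamount'].transform('sum') == 0
print(newdf[mask])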

How to check NULL values while comparing 2 text files using spark data frames

The code below fails to capture records with 'null' values. In df1 below, the record with NO 5 has a null in the NAME field.
As per my required OutputDF below, the NO 5 record should appear as shown, but after running the code it does not make it into the final output: records with 'null' values never reach the output. Apart from that, everything else works fine.
df1
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 600
3 HR GOPI 1500
5 HW 700
df2
NO DEPT NAME SAL
1 IT RAM 1000
2 IT SRI 900
4 MT SUMP 1200
5 HW MAHI 700
OutputDF
NO DEPT NAME SAL FLAG
1 IT RAM 1000 SAME
2 IT SRI 900 UPDATE
4 MT SUMP 1200 INSERT
3 HR GOPI 1500 DELETE
5 HW MAHI 700 UPDATE
from pyspark.shell import spark
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
sc = spark.sparkContext
filedf1 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\files\\file1.csv")
filedf2 = spark.read.option("header","true").option("delimiter", ",").csv("C:\\files\\file2.csv")
filedf1.createOrReplaceTempView("table1")
filedf2.createOrReplaceTempView("table2")
df1 = spark.sql( "select * from table1" )
df2 = spark.sql( "select * from table2" )
#DELETE
df_d = df1.join(df2, df1.NO == df2.NO, 'left').filter(F.isnull(df2.NO)).select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('DELETE').alias('FLAG'))
print("df_d left:",df_d.show())
#INSERT
df_i = df1.join(df2, df1.NO == df2.NO, 'right').filter(F.isnull(df1.NO)).select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('INSERT').alias('FLAG'))
print("df_i right:",df_i.show())
#SAME
df_s = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) == F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).select(df1.NO,df1.DEPT,df1.NAME,df1.SAL, F.lit('SAME').alias('FLAG'))
print("df_s inner:",df_s.show())
#UPDATE
df_u = df1.join(df2, df1.NO == df2.NO, 'inner').filter(F.concat(df2.NO,df2.DEPT,df2.NAME,df2.SAL) != F.concat(df1.NO,df1.DEPT,df1.NAME,df1.SAL)).select(df2.NO,df2.DEPT,df2.NAME,df2.SAL, F.lit('UPDATE').alias('FLAG'))
print("df_u inner:",df_u.show())
df = df_d.union(df_i).union(df_s).union(df_u)
df.show()
Here I'm comparing df1 and df2: new records found only in df2 get the flag INSERT, records identical in both dfs get SAME, records present in df1 but not in df2 get DELETE, and records that exist in both dfs but with different values take the df2 values with the flag UPDATE.
There are two issues with the code:
The result of F.concat is null when any of its inputs is null, so this part of the code filters out row NO 5:
.filter(F.concat(df2.NO, df2.NAME, df2.SAL) != F.concat(df1.NO, df1.NAME, df1.SAL))
You are only selecting from df2. That's fine in the example above, but if your df2 has a null then the resulting dataframe will carry that null.
You can try concatenating with a udf, as below:
from pyspark.sql.functions import udf, struct, coalesce
from pyspark.sql.types import StringType
def concat_cols(row):
    concat_row = ''.join([str(col) for col in row if col is not None])
    return concat_row
udf_concat_cols = udf(concat_cols, StringType())
The line that builds concat_row can be broken down into two parts:
''.join(mylist) is a string method; it joins everything in the list with the given delimiter, in this case an empty string.
[str(col) for col in row if col is not None] is a list comprehension, and it does what it reads: for each column in the row, if the column is not None, append str(col) to the list.
A list comprehension is just a more pythonic way of doing this:
mylist = []
for col in row:
    if col is not None:
        mylist.append(str(col))
You can replace your update code as:
df_u = (df1
.join(df2, df1.NO == df2.NO, 'inner')
.filter(udf_concat_cols(struct(df1.NO, df1.NAME, df1.SAL)) != udf_concat_cols(struct(df2.NO, df2.NAME, df2.SAL)))
.select(coalesce(df1.NO, df2.NO),
coalesce(df1.NAME, df2.NAME),
coalesce(df1.SAL, df2.SAL),
F.lit('UPDATE').alias('FLAG')))
You should do something similar for your #SAME flag and break the line for readability.
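For completeness, the matching #SAME filter with the same udf would look roughly like this (a sketch, reusing udf_concat_cols and struct from above):
df_s = (df1
        .join(df2, df1.NO == df2.NO, 'inner')
        .filter(udf_concat_cols(struct(df1.NO, df1.NAME, df1.SAL)) ==
                udf_concat_cols(struct(df2.NO, df2.NAME, df2.SAL)))
        .select(df1.NO, df1.DEPT, df1.NAME, df1.SAL, F.lit('SAME').alias('FLAG')))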
Update:
If df2 always has the correct (updated) values, there is no need to coalesce.
The code for this instance would be:
df_u = (df1
.join(df2, df1.NO == df2.NO, 'inner')
.filter(udf_concat_cols(struct(df1.NO, df1.NAME, df1.SAL)) != udf_concat_cols(struct(df2.NO, df2.NAME, df2.SAL)))
.select(df2.NO,
df2.NAME,
df2.SAL,
F.lit('UPDATE').alias('FLAG')))
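As an aside, Spark's built-in F.concat_ws skips null values, so a udf-free variant of the same filter is possible (a sketch; the '|' separator is arbitrary):
df_u = (df1
        .join(df2, df1.NO == df2.NO, 'inner')
        .filter(F.concat_ws('|', df1.NO, df1.NAME, df1.SAL) !=
                F.concat_ws('|', df2.NO, df2.NAME, df2.SAL))
        .select(df2.NO, df2.DEPT, df2.NAME, df2.SAL, F.lit('UPDATE').alias('FLAG')))
Because nulls are dropped rather than kept as empty slots, rows that differ only in which column is null could still compare equal, so the udf comparison above remains the stricter check.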

Spark: Merge 2 dataframes by adding row index/number on both dataframes

Q: Is there any way to merge two dataframes, or copy a column of one dataframe to another, in PySpark?
For example, I have two Dataframes:
DF1
C1 C2
23397414 20875.7353
5213970 20497.5582
41323308 20935.7956
123276113 18884.0477
76456078 18389.9269
the second dataframe
DF2
C3 C4
2008-02-04 262.00
2008-02-05 257.25
2008-02-06 262.75
2008-02-07 237.00
2008-02-08 231.00
Then I want to add C3 of DF2 to DF1 like this:
New DF
C1 C2 C3
23397414 20875.7353 2008-02-04
5213970 20497.5582 2008-02-05
41323308 20935.7956 2008-02-06
123276113 18884.0477 2008-02-07
76456078 18389.9269 2008-02-08
I hope this example was clear.
row_number + window function (solution 1) or zipWithIndex.map (solution 2) should help in this case.
Solution 1: You can use window functions to get this kind of row number.
I would suggest adding a row number as an additional column (say columnindex) to each Dataframe, e.g. df1:
DF1
C1 C2 columnindex
23397414 20875.7353 1
5213970 20497.5582 2
41323308 20935.7956 3
123276113 18884.0477 4
76456078 18389.9269 5
the second dataframe
DF2
C3 C4 columnindex
2008-02-04 262.00 1
2008-02-05 257.25 2
2008-02-06 262.75 3
2008-02-07 237.00 4
2008-02-08 231.00 5
Now do an inner join of df1 and df2 on columnindex, that's all.
You will get the output below, with code something like this:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, monotonically_increasing_id
# row_number (formerly rowNumber) needs an ordered window
w = Window.orderBy(monotonically_increasing_id())
df1 = ....  # as shown above: df1
df2 = ....  # as shown above: df2
df11 = df1.withColumn("columnindex", row_number().over(w))
df22 = df2.withColumn("columnindex", row_number().over(w))
newDF = df11.join(df22, df11.columnindex == df22.columnindex, 'inner').drop(df22.columnindex)
newDF.show()
New DF
C1 C2 C3
23397414 20875.7353 2008-02-04
5213970 20497.5582 2008-02-05
41323308 20935.7956 2008-02-06
123276113 18884.0477 2008-02-07
76456078 18389.9269 2008-02-08
Solution 2: Another good way (probably the best :)) in Scala, which you can translate to pyspark:
/**
* Add Column Index to dataframe
*/
def addColumnIndex(df: DataFrame) = sqlContext.createDataFrame(
// Add Column index
df.rdd.zipWithIndex.map{case (row, columnindex) => Row.fromSeq(row.toSeq :+ columnindex)},
// Create schema
StructType(df.schema.fields :+ StructField("columnindex", LongType, false))
)
// Add index now...
val df1WithIndex = addColumnIndex(df1)
val df2WithIndex = addColumnIndex(df2)
// Now time to join ...
val newone = df1WithIndex
.join(df2WithIndex , Seq("columnindex"))
.drop("columnindex")
I thought I would share the python (pyspark) translation for solution 2 above from @Ram Ghadiyaram:
from pyspark.sql.functions import col
def addColumnIndex(df):
    # Create new column names
    oldColumns = df.schema.names
    newColumns = oldColumns + ["columnindex"]
    # Add Column index
    df_indexed = df.rdd.zipWithIndex().map(lambda (row, columnindex): \
        row + (columnindex,)).toDF()
    # Rename all the columns
    new_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx],
        newColumns[idx]), xrange(len(oldColumns)), df_indexed)
    return new_df
# Add index now...
df1WithIndex = addColumnIndex(df1)
df2WithIndex = addColumnIndex(df2)
#Now time to join ...
newone = df1WithIndex.join(df2WithIndex, "columnindex", 'inner').drop("columnindex")
For a Python 3 version:
from pyspark.sql.types import StructType, StructField, LongType
def with_column_index(sdf):
    new_schema = StructType(sdf.schema.fields + [StructField("ColumnIndex", LongType(), False),])
    return sdf.rdd.zipWithIndex().map(lambda row: row[0] + (row[1],)).toDF(schema=new_schema)
df1_ci = with_column_index(df1)
df2_ci = with_column_index(df2)
join_on_index = df1_ci.join(df2_ci, df1_ci.ColumnIndex == df2_ci.ColumnIndex, 'inner').drop("ColumnIndex")
I referred to his (@Jed) answer:
from pyspark.sql.functions import col
def addColumnIndex(df):
    # Get old column names and add a column "columnindex"
    oldColumns = df.columns
    newColumns = oldColumns + ["columnindex"]
    # Add Column index
    df_indexed = df.rdd.zipWithIndex().map(lambda (row, columnindex): \
        row + (columnindex,)).toDF()
    # Rename all the columns
    oldColumns = df_indexed.columns
    new_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx],
        newColumns[idx]), xrange(len(oldColumns)), df_indexed)
    return new_df
# Add index now...
df1WithIndex = addColumnIndex(df1)
df2WithIndex = addColumnIndex(df2)
#Now time to join ...
newone = df1WithIndex.join(df2WithIndex, "columnindex", 'inner').drop("columnindex")
This answer solved it for me:
import pyspark.sql.functions as sparkf
# This will return a new DF with all the columns + id
res = df.withColumn('id', sparkf.monotonically_increasing_id())
Credit to Arkadi T
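In practice the generated id is added to both frames and then used as the join key; a minimal sketch (df1 and df2 stand for the two frames from the question):
import pyspark.sql.functions as sparkf
df1_ix = df1.withColumn('id', sparkf.monotonically_increasing_id())
df2_ix = df2.withColumn('id', sparkf.monotonically_increasing_id())
merged = df1_ix.join(df2_ix, 'id', 'inner').drop('id')
Keep in mind that these ids only line up row-for-row when both frames have the same partitioning; the row_number-based answer further down avoids that assumption.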
Here is a simple example that can help you even if you have already solved the issue.
//create First Dataframe
val df1 = spark.sparkContext.parallelize(Seq(1,2,1)).toDF("lavel1")
//create second Dataframe
val df2 = spark.sparkContext.parallelize(Seq((1.0, 12.1), (12.1, 1.3), (1.1, 0.3))). toDF("f1", "f2")
//Combine both dataframe
val combinedRow = df1.rdd.zip(df2.rdd). map({
//convert both dataframe to Seq and join them and return as a row
case (df1Data, df2Data) => Row.fromSeq(df1Data.toSeq ++ df2Data.toSeq)
})
// create new Schema from both the dataframe's schema
val combinedschema = StructType(df1.schema.fields ++ df2.schema.fields)
// Create a new dataframe from new row and new schema
val finalDF = spark.sqlContext.createDataFrame(combinedRow, combinedschema)
finalDF.show
Expanding on Jed's answer, in response to Ajinkya's comment:
To get the same old column names, you need to replace "old_cols" with a column list of the newly named indexed columns. See my modified version of the function below
def add_column_index(df):
    new_cols = df.schema.names + ['ix']
    ix_df = df.rdd.zipWithIndex().map(lambda (row, ix): row + (ix,)).toDF()
    tmp_cols = ix_df.schema.names
    return reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx], new_cols[idx]), xrange(len(tmp_cols)), ix_df)
Not the best way performance-wise, but you can also use a cross join (note that .show() returns None, so keep the frame and the display separate):
df3 = df1.crossJoin(df2)
df3.show(3)
To merge columns from two different dataframes you first have to create a column index and then join the two dataframes. Indeed, two dataframes are similar to two SQL tables, and to connect them you have to join them.
If you don't care about the final order of the rows, you can generate the index column with monotonically_increasing_id().
Using the following code you can check that monotonically_increasing_id generates the same index column in both dataframes (at least up to a billion rows), so you won't have any mismatch in the merged dataframe.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.getOrCreate()
sample_size = int(1E9)
sdf1 = spark.range(1, sample_size).select(F.col("id").alias("id1"))
sdf2 = spark.range(1, sample_size).select(F.col("id").alias("id2"))
sdf1 = sdf1.withColumn("idx", F.monotonically_increasing_id())
sdf2 = sdf2.withColumn("idx", F.monotonically_increasing_id())
sdf3 = sdf1.join(sdf2, 'idx', 'inner')
sdf3 = sdf3.withColumn("diff", F.col("id1")-F.col("id2")).select("diff")
sdf3.filter(F.col("diff") != 0).show()
You can use a combination of monotonically_increasing_id (guaranteed to always be increasing) and row_number (guaranteed to always give the same sequence). You cannot use row_number alone because it needs to be ordered by something. So here we order by monotonically_increasing_id. I am using Spark 2.3.1 and Python 2.7.13.
from pandas import DataFrame
from pyspark.sql.functions import (
monotonically_increasing_id,
row_number)
from pyspark.sql import Window
DF1 = spark.createDataFrame(DataFrame({
'C1': [23397414, 5213970, 41323308, 123276113, 76456078],
'C2': [20875.7353, 20497.5582, 20935.7956, 18884.0477, 18389.9269]}))
DF2 = spark.createDataFrame(DataFrame({
'C3':['2008-02-04', '2008-02-05', '2008-02-06', '2008-02-07', '2008-02-08']}))
DF1_idx = (
DF1
.withColumn('id', monotonically_increasing_id())
.withColumn('columnindex', row_number().over(Window.orderBy('id')))
.select('columnindex', 'C1', 'C2'))
DF2_idx = (
DF2
.withColumn('id', monotonically_increasing_id())
.withColumn('columnindex', row_number().over(Window.orderBy('id')))
.select('columnindex', 'C3'))
DF_complete = (
DF1_idx
.join(
other=DF2_idx,
on=['columnindex'],
how='inner')
.select('C1', 'C2', 'C3'))
DF_complete.show()
+---------+----------+----------+
| C1| C2| C3|
+---------+----------+----------+
| 23397414|20875.7353|2008-02-04|
| 5213970|20497.5582|2008-02-05|
| 41323308|20935.7956|2008-02-06|
|123276113|18884.0477|2008-02-07|
| 76456078|18389.9269|2008-02-08|
+---------+----------+----------+
