Compare two DataFrames and check for changes - apache-spark

I have 2 similar Spark Dataframes df1 and df2 that I want to compare for changes:
df1 and df2 share the same columns
df2 can have more rows than df1 but any additional rows in df2 which is not in df1 can be ignored when comparing
Comparaison key columns are PROGRAM_NAME and ACTION
df1 = spark.createDataFrame([
["PROG1","ACTION1","10","NEW"],
["PROG2","ACTION2","12","NEW"],
["PROG3","ACTION1","14","NEW"],
["PROG4","ACTION4","16","NEW"]
],["PROGRAM_NAME", "ACTION", "VALUE1", "STATUS"])
df2 = spark.createDataFrame([
["PROG1","ACTION1","11","IN PROGRESS"],
["PROG2","ACTION2","12","NEW"],
["PROG3","ACTION1","20","FINISHED"],
["PROG4","ACTION4","14","IN PROGRESS"],
["PROG5","ACTION1","20","NEW"]
],["PROGRAM_NAME", "ACTION", "VALUE1", "STATUS"])
Showing below in order df1, df2 and the expected result I want after comparing the 2 DataFrames.

Similar questions have been asked multiple times here in SO.
Use a simple join to get rows that are in df1 and df2 and filter on those that have different values for the 2 other columns:
from pyspark.sql.functions import col
df_final = df2.alias("new").join(
df1.alias("old"),
(col("new.PROGRAM_NAME") == col("old.PROGRAM_NAME")) & (col("new.ACTION") == col("old.ACTION"))
).filter(
(col("new.VALUE1") != col("old.VALUE1")) | (col("new.STATUS") != col("old.STATUS"))
).select("new.*")
df_final.show()
#+------------+-------+------+-----------+
#|PROGRAM_NAME| ACTION|VALUE1| STATUS|
#+------------+-------+------+-----------+
#| PROG3|ACTION1| 20| FINISHED|
#| PROG4|ACTION4| 14|IN PROGRESS|
#| PROG1|ACTION1| 11|IN PROGRESS|
#+------------+-------+------+-----------+
You can also add the filter condition directly to the join condition

You can achieve the result like this:
import pandas as pd
dict1 = {"PROGRAM_NAME":["PROG1","PROG2","PROG3","PROG4"],
"ACTION":["ACTION1","ACTION2","ACTION1","ACTION4"],
"Value1":[10,12,14,16],
"Status":["NEW","NEW","NEW","NEW"]}
dict2 = {"PROGRAM_NAME":["PROG1","PROG2","PROG3","PROG4","PROG5"],
"ACTION":["ACTION1","ACTION2","ACTION1","ACTION4","ACTION1"],
"Value1":[11,12,20,14,20],
"Status":["IN PROGRES","NEW","FINISHED","IN PROGRES","NEW"]}
DF1 = pd.DataFrame(dict1)
DF2 = pd.DataFrame(dict2)
DF3 = DF2.copy()
DF3 = DF3[DF3["PROGRAM_NAME"].isin(DF1["PROGRAM_NAME"])]
Output:

You can merge df1 and df2 and retain VALUE1 and STATUS just of df2
df1
df2
Keeping suffixes of columns of df1 as _x and that of df2 as blank and then retaining just the columns of df2
df1.merge(df2, on=['PROGRAM_NAME', 'ACTION'], suffixes=('_x', ''))[df2.columns]

Here's the solution in Spark:
import pyspark.sql.types as T
import pyspark.sql.functions as F
df1 = spark.createDataFrame(
[
('PROG1', 'ACTION1', 10, 'NEW'),
('PROG2', 'ACTION2', 12, 'NEW'),
('PROG3', 'ACTION1', 14, 'NEW'),
('PROG4', 'ACTION4', 16, 'NEW'),
],
['PROGRAM_NAME', 'ACTION', 'Value1', 'Status']
)
df2 = spark.createDataFrame(
[
('PROG1', 'ACTION1', 11, 'IN PROGRESS'),
('PROG2', 'ACTION2', 12, 'NEW'),
('PROG3', 'ACTION1', 20, 'FINISHED'),
('PROG4', 'ACTION4', 14, 'IN PROGRESS'),
('PROG5', 'ACTION1', 20, 'NEW'),
],
['PROGRAM_NAME', 'ACTION', 'Value1', 'Status']
)
df1 = df1.alias('df1')
df2 = df2.alias('df2')
df = df1.join(df2, on=['PROGRAM_NAME', 'ACTION'], how='inner')
df = df.filter(F.col('df1.Status') != F.col('df2.Status'))
df.select(
F.col('PROGRAM_NAME'),
F.col('ACTION'),
*[F.col(f'df2.{col}') for col in df2.columns[2:]]
)

Related

PySpark row to struct with specified structure

This is my initial dataframe:
columns = ["CounterpartID","Year","Month","Day","churnprobability", "deadprobability"]
data = [(1234, 2021,5,12, 0.85,0.6),(1224, 2022,6,12, 0.75,0.6),(1345, 2022,5,13, 0.8,0.2),(234, 2021,7,12, 0.9,0.8)]
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
schema = StructType([
StructField("client_id", IntegerType(), False),
StructField("year", IntegerType(), False),
StructField("month", IntegerType(), False),
StructField("day", IntegerType(), False),
StructField("churn_probability", DoubleType(), False),
StructField("dead_probability", DoubleType(), False)
])
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)
Then I do some transformations on the columns (basically, separating out the float columns into before decimals and after decimals columns) to get the intermediary dataframe.
abc = df.rdd.map(lambda x: (x[0],x[1],x[2],x[3],int(x[4]),int(x[4]%1 * pow(10,9)), int(x[5]),int(x[5]%1 * pow(10,9)) )).toDF(['client_id','year', 'month', 'day', 'churn_probability_unit', 'churn_probability_nano', 'dead_probability_unit', 'dead_probability_nano'] )
display(abc)
Below is the final desired dataframe (this is just an example of one row, but of course I'll need all the rows from the intermediary dataframe.
sjson = {"clientId": {"id": 1234 },"eventDate": {"year": 2022,"month": 8,"day": 5},"churnProbability": {"rate": {"units": "500","nanos": 780000000}},"deadProbability": {"rate": {"units": "500","nanos": 780000000}}}
df = spark.read.json(sc.parallelize([sjson])).select("clientId", "eventDate", "churnProbability", "deadProbability")
display(df)
How do I reach this end state from the intermediary state efficiently for all rows?
End goal is to use this final dataframe to write to Kafka where the schema of the topic is a form of the final desired dataframe.
I would probably eliminate the use of rdd logic (and again toDF) by using just one select from your original df:
from pyspark.sql import functions as F
defg = df.select(
F.struct(F.col('client_id').alias('id')).alias('clientId'),
F.struct('year', 'month', 'day').alias('eventDate'),
F.struct(
F.struct(
F.floor('churn_probability').alias('unit'),
(F.col('churn_probability') % 1 * 10**9).cast('long').alias('nanos')
).alias('rate')
).alias('churnProbability'),
F.struct(
F.struct(
F.floor('dead_probability').alias('unit'),
(F.col('dead_probability') % 1 * 10**9).cast('long').alias('nanos')
).alias('rate')
).alias('deadProbability'),
)
defg.show()
# +--------+-------------+----------------+----------------+
# |clientId| eventDate|churnProbability| deadProbability|
# +--------+-------------+----------------+----------------+
# | {1234}|{2021, 5, 12}|{{0, 850000000}}|{{0, 600000000}}|
# | {1224}|{2022, 6, 12}|{{0, 750000000}}|{{0, 600000000}}|
# | {1345}|{2022, 5, 13}|{{0, 800000000}}|{{0, 200000000}}|
# | {234}|{2021, 7, 12}|{{0, 900000000}}|{{0, 800000000}}|
# +--------+-------------+----------------+----------------+
So, I was able to solve this using structs , without using to_json
import pyspark.sql.functions as f
defg = abc.withColumn(
"clientId",
f.struct(
f.col("client_id").
alias("id")
)).withColumn(
"eventDate",
f.struct(
f.col("year").alias("year"),
f.col("month").alias("month"),
f.col("day").alias("day"),
)
).withColumn(
"churnProbability",
f.struct( f.struct(
f.col("churn_probability_unit").alias("unit"),
f.col("churn_probability_nano").alias("nanos")
).alias("rate")
)
).withColumn(
"deadProbability",
f.struct( f.struct(
f.col("dead_probability_unit").alias("unit"),
f.col("dead_probability_nano").alias("nanos")
).alias("rate")
)
).select ("clientId","eventDate","churnProbability", "deadProbability" )

iterate over a column check condition and carry calculations with values of other data frames

import pandas as pd
import numpy as np
I do have 3 dataframes df1, df2 and df3.
df1=
data = {'Period': ['2024-04-O1', '2024-07-O1', '2024-10-O1', '2025-01-O1', '2025-04-O1', '2025-07-O1', '2025-10-O1', '2026-01-O1', '2026-04-O1', '2026-07-O1', '2026-10-O1', '2027-01-O1', '2027-04-O1', '2027-07-O1', '2027-10-O1', '2028-01-O1', '2028-04-O1', '2028-07-O1', '2028-10-O1'],
'Price': ['NaN','NaN','NaN','NaN', 'NaN','NaN','NaN','NaN', 'NaN','NaN','NaN','NaN',
'NaN','NaN','NaN','NaN', 'NaN','NaN','NaN'],
'years': [2024,2024,2024,2025,2025,2025,2025,2026,2026,2026,2026,2027,2027,2027,2027,2028,
2028,2028,2028],
'quarters':[2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4]
}
df1 = pd.DataFrame(data=data)
df2=
data = {'price': [473.26,244,204,185, 152, 157],
'year': [2023, 2024, 2025, 2026, 2027, 2028]
}
df3 = pd.DataFrame(data=data)
df3=
data = {'quarters': [1,2,3,4],
'weights': [1.22, 0.81, 0.83, 1.12]
}
df2 = pd.DataFrame(data=data)
My aim is to compute the price of df1. For each iteration through df1 check condition and carry calculations accordingly. For example for the 1st iteration, check if df1['year']=2024 and df1['quarters']=2. Then df1['price']=df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==2, 'weights'].
===>>> df1['price'][0]=**473.26*0.81**.
df1['price'][1]=**473.26*0.83**.
...
...
...
and so on.
I could ha used this method but i want to write a code in a more efficient way. I would like to use the following code structure.
for i in range(len(df1)):
if (df1['year']==2024) & (df1['quarter']==2):
df1['Price']= df2.loc[df2['year']==2024, 'price'] * df3.loc[df3['quarters']==2, 'weights']
elif (df1['year']==2024) & (df1['quarter']==3):
df1['price']= df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==3, 'weights']
elif (df1['year']==2024) & (df1['quarters']==4):
df1['Price']= df2.loc[df2['year']=='2024', 'price'] * df3.loc[df3['quarters']==4, 'weights']
...
...
...
Thanks!!!
I think if I understand correctly you can use pd.merge to bring these fields together first.
df1 = df1.merge(df2, how='left' , left_on='years', right_on='year')
df1 = df1.merge(df3, how='left' , left_on='quarters', right_on='quarters')
df1['Price'] = df1['price']*df1['weights']

Explode array values into multiple columns using PySpark

I am new to pyspark and I want to explode array values in such a way that each value gets assigned to a new column. I tried using explode but I couldn't get the desired output. Below is my output
this is the code
from pyspark.sql import *
from pyspark.sql.functions import explode
if __name__ == "__main__":
spark = SparkSession.builder \
.master("local[3]") \
.appName("DataOps") \
.getOrCreate()
dataFrameJSON = spark.read \
.option("multiLine", True) \
.option("mode", "PERMISSIVE") \
.json("data.json")
dataFrameJSON.printSchema()
sub_DF = dataFrameJSON.select(explode("values.line").alias("new_values"))
sub_DF.printSchema()
sub_DF2 = sub_DF.select("new_values.*")
sub_DF2.printSchema()
sub_DF.show(truncate=False)
new_DF = sub_DF2.select("id", "period.*", "property")
new_DF.show(truncate=False)
new_DF.printSchema()
this is data:
{
"values" : {
"line" : [
{
"id" : 1,
"period" : {
"start_ts" : "2020-01-01T00:00:00",
"end_ts" : "2020-01-01T00:15:00"
},
"property" : [
{
"name" : "PID",
"val" : "P120E12345678"
},
{
"name" : "EngID",
"val" : "PANELID00000000"
},
{
"name" : "TownIstat",
"val" : "12058091"
},
{
"name" : "ActiveEng",
"val" : "5678.1"
}
]
}
}
Could you include the data instead of screenshots ?
Meanwhile, assuming that df is the dataframe being used, what we need to do, is to create a new dataframe, while exrtracting the vals from the previous property array to new columns, and droping the property column at last :
from pyspark.sql.functions import col
output_df = df.withColumn("PID", col("property")[0].val).withColumn("EngID", col("property")[1].val).withColumn("TownIstat", col("property")[2].val).withColumn("ActiveEng", col("property")[3].val).drop("property")
In case the elementwas of type ArrayType use the following :
from pyspark.sql.functions import col
output_df = df.withColumn("PID", col("property")[0][1]).withColumn("EngID", col("property")[1][1]).withColumn("TownIstat", col("property")[2][1]).withColumn("ActiveEng", col("property")[3][1]).drop("property")
Explode will explode the arrays into new Rows, not columns, see this : pyspark explode
This is a general solution and works even when the JSONs are messy (different ordering of elements or if some of the elements are missing)
You got to flatten first, regexp_replace to split the 'property' column and finally pivot. This also avoids hard coding of the new column names.
Constructing your dataframe:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import *
schema = StructType([StructField("id", IntegerType()), StructField("start_ts", StringType()), StructField("end_ts", StringType()), \
StructField("property", ArrayType(StructType( [StructField("name", StringType()), StructField("val", StringType())] )))])
data = [[1, "2010", "2020", [["PID", "P123"], ["Eng", "PA111"], ["Town", "999"], ["Act", "123.1"]]],\
[2, "2011", "2012", [["PID", "P456"], ["Eng", "PA222"], ["Town", "777"], ["Act", "234.1"]]]]
df = spark.createDataFrame(data,schema=schema)
df.show(truncate=False)
+---+--------+------+------------------------------------------------------+
|id |start_ts|end_ts|property |
+---+--------+------+------------------------------------------------------+
|1 |2010 |2020 |[[PID, P123], [Eng, PA111], [Town, 999], [Act, 123.1]]|
|2 |2011 |2012 |[[PID, P456], [Eng, PA222], [Town, 777], [Act, 234.1]]|
+---+--------+------+------------------------------------------------------+
Flattening and pivoting:
df_flatten = df.rdd.flatMap(lambda x: [(x[0],x[1], x[2], y) for y in x[3]]).toDF(['id', 'start_ts', 'end_ts', 'property'])\
.select('id', 'start_ts', 'end_ts', col("property").cast("string"))
df_split = df_flatten.select('id', 'start_ts', 'end_ts', regexp_replace(df_flatten.property, "[\[\]]", "").alias("replacced_col"))\
.withColumn("arr", split(col("replacced_col"), ", "))\
.select(col("arr")[0].alias("col1"), col("arr")[1].alias("col2"), 'id', 'start_ts', 'end_ts')
final_df = df_split.groupby(df_split.id,)\
.pivot("col1")\
.agg(first("col2"))\
.join(df,'id').drop("property")
Output:
final_df.show()
+---+-----+-----+----+----+--------+------+
| id| Act| Eng| PID|Town|start_ts|end_ts|
+---+-----+-----+----+----+--------+------+
| 1|123.1|PA111|P123| 999| 2010| 2020|
| 2|234.1|PA222|P456| 777| 2011| 2012|
+---+-----+-----+----+----+--------+------+

Spark: Merge 2 dataframes by adding row index/number on both dataframes

Q: Is there is any way to merge two dataframes or copy a column of a dataframe to another in PySpark?
For example, I have two Dataframes:
DF1
C1 C2
23397414 20875.7353
5213970 20497.5582
41323308 20935.7956
123276113 18884.0477
76456078 18389.9269
the seconde dataframe
DF2
C3 C4
2008-02-04 262.00
2008-02-05 257.25
2008-02-06 262.75
2008-02-07 237.00
2008-02-08 231.00
Then i want to add C3 of DF2 to DF1 like this:
New DF
C1 C2 C3
23397414 20875.7353 2008-02-04
5213970 20497.5582 2008-02-05
41323308 20935.7956 2008-02-06
123276113 18884.0477 2008-02-07
76456078 18389.9269 2008-02-08
I hope this example was clear.
rownum + window function i.e solution 1 or zipWithIndex.map i.e solution 2 should help in this case.
Solution 1 : You can use window functions to get this kind of
Then I would suggest you to add rownumber as additional column name to Dataframe say df1.
DF1
C1 C2 columnindex
23397414 20875.7353 1
5213970 20497.5582 2
41323308 20935.7956 3
123276113 18884.0477 4
76456078 18389.9269 5
the second dataframe
DF2
C3 C4 columnindex
2008-02-04 262.00 1
2008-02-05 257.25 2
2008-02-06 262.75 3
2008-02-07 237.00 4
2008-02-08 231.00 5
Now .. do inner join of df1 and df2 that's all...
you will get below ouput
something like this
from pyspark.sql.window import Window
from pyspark.sql.functions import rowNumber
w = Window().orderBy()
df1 = .... // as showed above df1
df2 = .... // as shown above df2
df11 = df1.withColumn("columnindex", rowNumber().over(w))
df22 = df2.withColumn("columnindex", rowNumber().over(w))
newDF = df11.join(df22, df11.columnindex == df22.columnindex, 'inner').drop(df22.columnindex)
newDF.show()
New DF
C1 C2 C3
23397414 20875.7353 2008-02-04
5213970 20497.5582 2008-02-05
41323308 20935.7956 2008-02-06
123276113 18884.0477 2008-02-07
76456078 18389.9269 2008-02-08
Solution 2 : Another good way(probably this is best :)) in scala, which you can translate to pyspark :
/**
* Add Column Index to dataframe
*/
def addColumnIndex(df: DataFrame) = sqlContext.createDataFrame(
// Add Column index
df.rdd.zipWithIndex.map{case (row, columnindex) => Row.fromSeq(row.toSeq :+ columnindex)},
// Create schema
StructType(df.schema.fields :+ StructField("columnindex", LongType, false))
)
// Add index now...
val df1WithIndex = addColumnIndex(df1)
val df2WithIndex = addColumnIndex(df2)
// Now time to join ...
val newone = df1WithIndex
.join(df2WithIndex , Seq("columnindex"))
.drop("columnindex")
I thought I would share the python (pyspark) translation for answer #2 above from #Ram Ghadiyaram:
from pyspark.sql.functions import col
def addColumnIndex(df):
# Create new column names
oldColumns = df.schema.names
newColumns = oldColumns + ["columnindex"]
# Add Column index
df_indexed = df.rdd.zipWithIndex().map(lambda (row, columnindex): \
row + (columnindex,)).toDF()
#Rename all the columns
new_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx],
newColumns[idx]), xrange(len(oldColumns)), df_indexed)
return new_df
# Add index now...
df1WithIndex = addColumnIndex(df1)
df2WithIndex = addColumnIndex(df2)
#Now time to join ...
newone = df1WithIndex.join(df2WithIndex, col("columnindex"),
'inner').drop("columnindex")
for python3 version,
from pyspark.sql.types import StructType, StructField, LongType
def with_column_index(sdf):
new_schema = StructType(sdf.schema.fields + [StructField("ColumnIndex", LongType(), False),])
return sdf.rdd.zipWithIndex().map(lambda row: row[0] + (row[1],)).toDF(schema=new_schema)
df1_ci = with_column_index(df1)
df2_ci = with_column_index(df2)
join_on_index = df1_ci.join(df2_ci, df1_ci.ColumnIndex == df2_ci.ColumnIndex, 'inner').drop("ColumnIndex")
I referred to his(#Jed) answer
from pyspark.sql.functions import col
def addColumnIndex(df):
# Get old columns names and add a column "columnindex"
oldColumns = df.columns
newColumns = oldColumns + ["columnindex"]
# Add Column index
df_indexed = df.rdd.zipWithIndex().map(lambda (row, columnindex): \
row + (columnindex,)).toDF()
#Rename all the columns
oldColumns = df_indexed.columns
new_df = reduce(lambda data, idx:data.withColumnRenamed(oldColumns[idx],
newColumns[idx]), xrange(len(oldColumns)), df_indexed)
return new_df
# Add index now...
df1WithIndex = addColumnIndex(df1)
df2WithIndex = addColumnIndex(df2)
#Now time to join ...
newone = df1WithIndex.join(df2WithIndex, col("columnindex"),
'inner').drop("columnindex")
This answer solved it for me:
import pyspark.sql.functions as sparkf
# This will return a new DF with all the columns + id
res = df.withColumn('id', sparkf.monotonically_increasing_id())
Credit to Arkadi T
Here is an simple example that can help you even if you have already solve the issue.
//create First Dataframe
val df1 = spark.sparkContext.parallelize(Seq(1,2,1)).toDF("lavel1")
//create second Dataframe
val df2 = spark.sparkContext.parallelize(Seq((1.0, 12.1), (12.1, 1.3), (1.1, 0.3))). toDF("f1", "f2")
//Combine both dataframe
val combinedRow = df1.rdd.zip(df2.rdd). map({
//convert both dataframe to Seq and join them and return as a row
case (df1Data, df2Data) => Row.fromSeq(df1Data.toSeq ++ df2Data.toSeq)
})
// create new Schema from both the dataframe's schema
val combinedschema = StructType(df1.schema.fields ++ df2.schema.fields)
// Create a new dataframe from new row and new schema
val finalDF = spark.sqlContext.createDataFrame(combinedRow, combinedschema)
finalDF.show
Expanding on Jed's answer, in response to Ajinkya's comment:
To get the same old column names, you need to replace "old_cols" with a column list of the newly named indexed columns. See my modified version of the function below
def add_column_index(df):
new_cols = df.schema.names + ['ix']
ix_df = df.rdd.zipWithIndex().map(lambda (row, ix): row + (ix,)).toDF()
tmp_cols = ix_df.schema.names
return reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx], new_cols[idx]), xrange(len(tmp_cols)), ix_df)
Not the better way performance wise.
df3=df1.crossJoin(df2).show(3)
To merge columns from two different dataframe you have first to create a column index and then join the two dataframes. Indeed, two dataframes are similar to two SQL tables. To make a connection you have to join them.
If you don't care about the final order of the rows you can generate the index column with monotonically_increasing_id().
Using the following code you can check that monotonically_increasing_id generates the same index column in both dataframes (at least up to a billion of rows), so you won't have any error in the merged dataframe.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
sample_size = 1E9
sdf1 = spark.range(1, sample_size).select(F.col("id").alias("id1"))
sdf2 = spark.range(1, sample_size).select(F.col("id").alias("id2"))
sdf1 = sdf1.withColumn("idx", sf.monotonically_increasing_id())
sdf2 = sdf2.withColumn("idx", sf.monotonically_increasing_id())
sdf3 = sdf1.join(sdf2, 'idx', 'inner')
sdf3 = sdf3.withColumn("diff", F.col("id1")-F.col("id2")).select("diff")
sdf3.filter(F.col("diff") != 0 ).show()
You can use a combination of monotonically_increasing_id (guaranteed to always be increasing) and row_number (guaranteed to always give the same sequence). You cannot use row_number alone because it needs to be ordered by something. So here we order by monotonically_increasing_id. I am using Spark 2.3.1 and Python 2.7.13.
from pandas import DataFrame
from pyspark.sql.functions import (
monotonically_increasing_id,
row_number)
from pyspark.sql import Window
DF1 = spark.createDataFrame(DataFrame({
'C1': [23397414, 5213970, 41323308, 123276113, 76456078],
'C2': [20875.7353, 20497.5582, 20935.7956, 18884.0477, 18389.9269]}))
DF2 = spark.createDataFrame(DataFrame({
'C3':['2008-02-04', '2008-02-05', '2008-02-06', '2008-02-07', '2008-02-08']}))
DF1_idx = (
DF1
.withColumn('id', monotonically_increasing_id())
.withColumn('columnindex', row_number().over(Window.orderBy('id')))
.select('columnindex', 'C1', 'C2'))
DF2_idx = (
DF2
.withColumn('id', monotonically_increasing_id())
.withColumn('columnindex', row_number().over(Window.orderBy('id')))
.select('columnindex', 'C3'))
DF_complete = (
DF1_idx
.join(
other=DF2_idx,
on=['columnindex'],
how='inner')
.select('C1', 'C2', 'C3'))
DF_complete.show()
+---------+----------+----------+
| C1| C2| C3|
+---------+----------+----------+
| 23397414|20875.7353|2008-02-04|
| 5213970|20497.5582|2008-02-05|
| 41323308|20935.7956|2008-02-06|
|123276113|18884.0477|2008-02-07|
| 76456078|18389.9269|2008-02-08|
+---------+----------+----------+

Drop previous pandas tables after merged into 1

I want to merge two dataframes together and then delete the first one to create space in RAM.
df1 = pd.read_csv(filepath, index_col='False')
df2 = pd.read_csv(filepath, index_col='False')
df3 = pd.read_csv(filepath, index_col='False')
df4 = pd.read_csv(filepath, index_col='False')
result = df1.merge(df2, on='column1', how='left', left_index='True', copy='False')
result2 = result.merge(df3, on='column1', how='left', left_index='True', copy='False')
Ideally what I would like to do after this is delete all of df1, df2, df3 and have the result2 dataframe left.
It's better NOT to produce unnecessary DFs:
file_list = glob.glob('/path/to/file_mask*.csv')
df = pd.read_csv(file_list[0], index_col='False')
for f in file_list[1:]:
df = df.merge(pd.read_csv(f, index_col='False'), on='column1', how='left')
PS IMO you can't (at least shouldn't) mix up on and left_index parameters. Maybe you meant right_on and left_index - that would be OK
Just use del
del df1, df2, df3, df4

Resources