Replace empty string with null values in RDD - apache-spark
Hello, I would like to convert the empty strings in my RDD to 0.
I have read 20 files and they are all in this format:
YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,2,6,5,OO,6271,N937SW,FAR,DEN,1712,1701,-11,15,1716,123,117,95,627,1751,7,1815,1758,-17,0,0,,,,,,
2015,1,19,1,AA,1605,N496AA,DFW,ONT,1740,1744,4,15,1759,193,198,175,1188,1854,8,1853,1902,9,0,0,,,,,,
2015,3,8,7,NK,1068,N519NK,LAS,CLE,2220,2210,-10,12,2222,238,229,208,1824,450,9,518,459,-19,0,0,,,,,,
2015,9,21,1,AA,1094,N3EDAA,DFW,BOS,1155,1155,0,12,1207,223,206,190,1562,1617,4,1638,1621,-17,0,0,,,,,,
I would like to fill these empty strings with the number 0.
from pyspark.sql import Row

def import_parse_rdd(data):
    # create rdd
    rdd = sc.textFile(data)
    # remove the header
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)  # filter out header
    # split by comma
    split_rdd = rdd.map(lambda line: line.split(','))
    row_rdd = split_rdd.map(lambda line: Row(
        YEAR = int(line[0]), MONTH = int(line[1]), DAY = int(line[2]), DAY_OF_WEEK = int(line[3]),
        AIRLINE = line[4], FLIGHT_NUMBER = int(line[5]),
        TAIL_NUMBER = line[6], ORIGIN_AIRPORT = line[7], DESTINATION_AIRPORT = line[8],
        SCHEDULED_DEPARTURE = line[9], DEPARTURE_TIME = line[10], DEPARTURE_DELAY = line[11], TAXI_OUT = line[12],
        WHEELS_OFF = line[13], SCHEDULED_TIME = line[14], ELAPSED_TIME = line[15], AIR_TIME = line[16], DISTANCE = line[17], WHEELS_ON = line[18], TAXI_IN = line[19],
        SCHEDULED_ARRIVAL = line[20], ARRIVAL_TIME = line[21], ARRIVAL_DELAY = line[22], DIVERTED = line[23], CANCELLED = line[24], CANCELLATION_REASON = line[25], AIR_SYSTEM_DELAY = line[26],
        SECURITY_DELAY = line[27], AIRLINE_DELAY = line[28], LATE_AIRCRAFT_DELAY = line[29], WEATHER_DELAY = line[30])
    )
    return row_rdd
The above is the code I am running.
I am working with RDD Row objects, not a DataFrame.
You can use na.fill("0") to replace all nulls with "0" strings.
spark.read.csv("path/to/file").na.fill(value="0").show()
In case you need integers, you can change the schema to convert string columns to integers.
You could add this to your dataframe to apply the change to a column named 'col_name'
from pyspark.sql import functions as F
(...)
.withColumn('col_name', F.regexp_replace('col_name', r'^$', '0'))
You could use this syntax directly in your code
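For the integer case mentioned above, a minimal sketch (column names are taken from the sample header; filling first and then casting only a few of the delay columns is an assumption for illustration):

from pyspark.sql import functions as F

df = spark.read.csv("path/to/file", header=True).na.fill("0")
# cast the (still string-typed) delay columns to integers after filling
for c in ["DEPARTURE_DELAY", "ARRIVAL_DELAY", "WEATHER_DELAY"]:
    df = df.withColumn(c, F.col(c).cast("int"))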
You can add an if-else condition while creating the Row.
Let's consider WEATHER_DELAY.
row_rdd = split_rdd.map(lambda line: Row(
    # ...all other columns as before...
    WEATHER_DELAY = 0 if line[30] == "" else line[30]))
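If several columns need the same treatment, a small helper keeps the Row construction readable. A minimal sketch (the zero_if_empty name is hypothetical; the columns shown are the trailing delay columns from the sample header):

def zero_if_empty(value):
    # return 0 for empty strings, otherwise the original value
    return 0 if value == "" else value

row_rdd = split_rdd.map(lambda line: Row(
    # ...all other columns as before...
    AIR_SYSTEM_DELAY = zero_if_empty(line[26]),
    SECURITY_DELAY = zero_if_empty(line[27]),
    AIRLINE_DELAY = zero_if_empty(line[28]),
    LATE_AIRCRAFT_DELAY = zero_if_empty(line[29]),
    WEATHER_DELAY = zero_if_empty(line[30])))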
Please allow me another try at your problem, this time transforming each Row with the RDD's map() method (foreach() is an action that returns None, so assigning its result would not give you a new RDD).
def f(x):
    # rebuild the Row, replacing empty-string fields with 0
    return Row(**{k: (0 if v == "" else v) for k, v in x.asDict().items()})
(...)
row_rdd = row_rdd.map(f) # to be added at the end of your script
Related
assign to grouped dataframe in Pandas
I want to calculate lags of multiple columns. I am able to do that for each column separately as shown below. How can I avoid the duplicate groupby and sorting?

### Pandas previous week values
search = search.assign(asp_lstwk2 = search.sort_values(by = 'firstdayofweek').groupby('asin_bk')['asp'].shift(1))\
    .assign(lbb_lstwk2 = search.sort_values(by = 'firstdayofweek').groupby('asin_bk')['lbb'].shift(1))\
    .assign(repoos_lstwk2 = search.sort_values(by = 'firstdayofweek').groupby('asin_bk')['repoos'].shift(1))\
    .assign(ordered_units_lstwk2 = search.sort_values(by = 'firstdayofweek').groupby('asin_bk')['ordered_units'].shift(1))
Try:

search = search.join(search.sort_values(by = 'firstdayofweek')
    .groupby('asin_bk')[['asp','lbb','repoos','ordered_units']]
    .shift().add_suffix('_lstwk2'))
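A minimal illustration of the pattern with made-up data (the column and group names follow the original question; the values are purely illustrative):

import pandas as pd

search = pd.DataFrame({
    'asin_bk': ['A', 'A', 'B', 'B'],
    'firstdayofweek': [1, 2, 1, 2],
    'asp': [10, 11, 20, 21],
    'lbb': [1, 2, 3, 4],
    'repoos': [0.1, 0.2, 0.3, 0.4],
    'ordered_units': [5, 6, 7, 8],
})

# one sort + groupby, shifted once, joined back on the index with a suffix
search = search.join(search.sort_values(by='firstdayofweek')
                     .groupby('asin_bk')[['asp', 'lbb', 'repoos', 'ordered_units']]
                     .shift().add_suffix('_lstwk2'))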
User defined hash partitioning in RDD with key
Hello, I would like to make my own hash function with the key being the column "arrival delay". The code that I have at the moment is:

# this is for the flights
def import_parse_rdd(data):
    # create rdd
    rdd = sc.textFile(data)
    # remove the header
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)  # filter out header
    # split by comma
    split_rdd = rdd.map(lambda line: line.split(','))
    row_rdd = split_rdd.map(lambda line: Row(
        YEAR = int(line[0]), MONTH = int(line[1]), DAY = int(line[2]), DAY_OF_WEEK = int(line[3]),
        AIRLINE = line[4], FLIGHT_NUMBER = int(line[5]),
        TAIL_NUMBER = line[6], ORIGIN_AIRPORT = line[7], DESTINATION_AIRPORT = line[8],
        SCHEDULED_DEPARTURE = line[9], DEPARTURE_TIME = line[10],
        DEPARTURE_DELAY = 0 if "".__eq__(line[11]) else float(line[11]),
        TAXI_OUT = 0 if "".__eq__(line[12]) else float(line[12]),
        WHEELS_OFF = line[13], SCHEDULED_TIME = line[14],
        ELAPSED_TIME = 0 if "".__eq__(line[15]) else float(line[15]),
        AIR_TIME = 0 if "".__eq__(line[16]) else float(line[16]),
        DISTANCE = 0 if "".__eq__(line[17]) else float(line[17]),
        WHEELS_ON = line[18],
        TAXI_IN = 0 if "".__eq__(line[19]) else float(line[19]),
        SCHEDULED_ARRIVAL = line[20], ARRIVAL_TIME = line[21],
        ARRIVAL_DELAY = 0 if "".__eq__(line[22]) else float(line[22]),
        DIVERTED = line[23], CANCELLED = line[24], CANCELLATION_REASON = line[25], AIR_SYSTEM_DELAY = line[26],
        SECURITY_DELAY = line[27], AIRLINE_DELAY = line[28], LATE_AIRCRAFT_DELAY = line[29], WEATHER_DELAY = line[30])
    )
    return row_rdd

If I take flight_rdd.take(1), the output is:

[Row(YEAR=2015, MONTH=6, DAY=26, DAY_OF_WEEK=5, AIRLINE='EV', FLIGHT_NUMBER=4951, TAIL_NUMBER='N707EV', ORIGIN_AIRPORT='BHM', DESTINATION_AIRPORT='LGA', SCHEDULED_DEPARTURE='630', DEPARTURE_TIME='629', DEPARTURE_DELAY=-1.0, TAXI_OUT=13.0, WHEELS_OFF='642', SCHEDULED_TIME='155', ELAPSED_TIME=141.0, AIR_TIME=113.0, DISTANCE=866.0, WHEELS_ON='935', TAXI_IN=15.0, SCHEDULED_ARRIVAL='1005', ARRIVAL_TIME='950', ARRIVAL_DELAY=-15.0, DIVERTED='0', CANCELLED='0', CANCELLATION_REASON='', AIR_SYSTEM_DELAY='', SECURITY_DELAY='', AIRLINE_DELAY='', LATE_AIRCRAFT_DELAY='', WEATHER_DELAY='')]

I would like to make a user defined hash partitioning function with the key being the ARRIVAL_DELAY column. If I could, I would also like the min and max values in the ARRIVAL_DELAY column to be used as a key to determine the distribution of the partitions. The furthest I have gotten is that flight_rdd.partitionBy(number of parts, key) is what I understand. The unprocessed version of the data set uses the same header and sample rows shown at the top of this page.
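No answer is recorded here, but a minimal sketch of the idea with the standard RDD API (the partition count, the num_partitions and partition_by_delay names, and keying on ARRIVAL_DELAY are assumptions; partitionBy only works on key-value pair RDDs, so each Row is keyed first):

num_partitions = 8  # assumed number of partitions

def partition_by_delay(arrival_delay):
    # user-defined partitioner: map an ARRIVAL_DELAY value to a partition id
    return int(arrival_delay) % num_partitions

# key each Row by its ARRIVAL_DELAY, then partition with the custom function
keyed = flight_rdd.map(lambda row: (row.ARRIVAL_DELAY, row))
partitioned = keyed.partitionBy(num_partitions, partition_by_delay)

If the min and max of ARRIVAL_DELAY should shape the distribution, they could be computed first (for example with keyed.keys().min() and keyed.keys().max()) and used inside partition_by_delay to bucket each delay into equal-width ranges.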
Python: Copying certain columns to an empty dataframe using For loop
wo = "C:/temp/temp/WO.xlsx" dfwo = pd.read_excel(wo) columnnames = ["TicketID","CreateDate","Status","Summary","CreatedBy","Company"] main = pd.DataFrame(columns = columnnames) for i in range(0,15): print(i) main["TicketID"][i] = dfwo["WO ID"][i] main["CreateDate"][i] = dfwo["WO Create TimeStamp"][i] main["Status"][i] = dfwo["Status"][i] main["Summary"][i] = dfwo["WO Summary"][i] main["CreatedBy"][i] = dfwo["Submitter Full Name"][i] main["Company"][i] = dfwo["Company"][i] I am trying to copy selected columns from 1 df to another. dfwo is a df derived from Excel Main is an empty dataframe and has selected columns from dfwo When I run this code, it gives me the error, "IndexError: index 0 is out of bounds for axis 0 with size 0" Any suggestions pls?
wo = "C:/temp/temp/WO.xlsx" dfwo = pd.read_excel(wo) columnnames =["TicketID","CreateDate","Status","Summary","CreatedBy","Company"] main = dfwo[columnnames] new_col_names = { "TicketID":"WO ID", "CreateDate":"WO Create TimeStamp", "Status":"Status", "Summary":"WO Summary", "CreatedBy":"Submitter Full Name", "Company":"Company" } main.rename(columns = new_col_names,inplace = True)
How to insert values of variables dynamically in SQL Server Database using python script
First row in the data file: 1,Maria,Anders,Berlin,Germany,0300174321

f = open("Customer.csv", "r")
for row in f.readlines(i):
    a = row
    x = a.split(",")
    ID1 = print(x[0].replace("",""))
    FIRST_NM1 = print(x[1])
    LAST_NM1 = print(x[2])
    CITY1 = print(x[3])
    COUNTRY1 = print(x[4])
    PHONE1 = print(x[5])
    cursor = cs.cursor()
    cursor.execute("INSERT INTO sales.dbo.Customer_temp (ID,FIRST_NM,LAST_NM,CITY,COUNTRY,PHONE) VALUES ('%s','%s','%s','%s','%s','%s')" %(ID1,FIRST_NM1,LAST_NM1,CITY1,COUNTRY1,PHONE1))
    cs.commit();

But it is inserting None in all rows, so could you please suggest a fix?
Instead of printing the values, you need to assign them:

FIRST_NM1 = x[1]
LAST_NM1 = x[2]
CITY1 = x[3]
etc.
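A sketch of the corrected loop under that advice (the cs connection object and the table and column names come from the question; the rest of the structure is an assumption about the intended script):

f = open("Customer.csv", "r")
cursor = cs.cursor()
for row in f.readlines():
    x = row.strip().split(",")
    # assign the values instead of printing them
    ID1, FIRST_NM1, LAST_NM1, CITY1, COUNTRY1, PHONE1 = x[0], x[1], x[2], x[3], x[4], x[5]
    cursor.execute(
        "INSERT INTO sales.dbo.Customer_temp (ID,FIRST_NM,LAST_NM,CITY,COUNTRY,PHONE) "
        "VALUES ('%s','%s','%s','%s','%s','%s')"
        % (ID1, FIRST_NM1, LAST_NM1, CITY1, COUNTRY1, PHONE1))
cs.commit()
f.close()

Passing the values as query parameters instead of string formatting would also guard against quoting problems, if the database driver supports it.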
Compare two dataframes and add mismatched values as a new column in Spark
The difference between two records is df1.except(df2), which gives results like this [screenshot not included]. How can I compare two dataframes, find what changed and in which column, and add that value as a new column? Expected output like this [screenshot not included].
Join the two dataframes on the primary key, then use withColumn and a UDF: pass both column values (old and new) to the UDF, compare the data, and return the value when it is not the same.

val check = udf((old_val: String, new_val: String) => if (old_val == new_val) new_val else "")

df_check = df
  .withColumn("Check_Name", check(df.col("name"), df.col("new_name")))
  .withColumn("Check_Namelast", check(df.col("lastname"), df.col("new_lastname")))

Or, as a function (this part is a pseudocode sketch rather than compiling Scala):

def fn(old_df: DataFrame, new_df: DataFrame): DataFrame = {
  val old_df_array = old_df.collect()          // turn df into an array to loop through
  val new_df_array = new_df.collect()          // turn df into an array to loop through
  var value_change: Array[String] = Array()
  val count = old_df.count
  val column_names = old_df.columns
  val column_count = column_names.length
  for (i <- 0 until count.toInt) {             // loop through all rows
    val old_row = old_df_array(i)
    val new_row = new_df_array(i)
    for (j <- 0 until column_count) {          // loop through all columns
      if (old_row(j) != new_row(j)) {
        value_change = value_change :+ (column_names(j) + " has value changed")   // collect all changes for the row
      }
    }
    // append the primary key and value_change as the remarks column
  }
  // convert the collected array back to a DataFrame
}
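Since the rest of this page uses PySpark, a minimal Python sketch of the first approach, following the written description (keep the new value where the columns differ); it assumes df is already the result of joining the two dataframes on the primary key, with name/new_name style column pairs:

from pyspark.sql import functions as F

# keep the new value where it differs from the old one, otherwise an empty string
def check(old_col, new_col):
    return F.when(F.col(old_col) == F.col(new_col), F.lit("")).otherwise(F.col(new_col))

df_check = (df
    .withColumn("Check_Name", check("name", "new_name"))
    .withColumn("Check_Namelast", check("lastname", "new_lastname")))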