### Pandas previous week values
I want to calculate lags of multiple columns. I am able to do it for each column separately, as shown below, but how can I avoid the duplicated groupby and sorting?
search = search.assign(asp_lstwk2=search.sort_values(by='firstdayofweek').groupby('asin_bk')['asp'].shift(1)) \
               .assign(lbb_lstwk2=search.sort_values(by='firstdayofweek').groupby('asin_bk')['lbb'].shift(1)) \
               .assign(repoos_lstwk2=search.sort_values(by='firstdayofweek').groupby('asin_bk')['repoos'].shift(1)) \
               .assign(ordered_units_lstwk2=search.sort_values(by='firstdayofweek').groupby('asin_bk')['ordered_units'].shift(1))
Try:
search = search.join(search.sort_values(by='firstdayofweek')
                           .groupby('asin_bk')[['asp', 'lbb', 'repoos', 'ordered_units']]
                           .shift()
                           .add_suffix('_lstwk2'))
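This sorts and groups once, shifts all four columns together, and joins the lagged frame back on the index. As a minimal, self-contained sketch (the toy data below is invented to mirror the question's schema):

import pandas as pd

search = pd.DataFrame({
    'asin_bk': ['A', 'A', 'B', 'B'],
    'firstdayofweek': ['2021-01-04', '2021-01-11', '2021-01-04', '2021-01-11'],
    'asp': [10.0, 11.0, 20.0, 21.0],
    'lbb': [0.9, 0.8, 0.7, 0.6],
    'repoos': [1, 2, 3, 4],
    'ordered_units': [5, 6, 7, 8],
})

# one sort + one groupby, shifted and suffixed, then joined back on the index
lags = (search.sort_values(by='firstdayofweek')
              .groupby('asin_bk')[['asp', 'lbb', 'repoos', 'ordered_units']]
              .shift()
              .add_suffix('_lstwk2'))
search = search.join(lags)
print(search)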
Hello, I would like to make my own hash function with the key being the column "arrival delay".
The code that I have at the moment is:
# this is for the flights
from pyspark.sql import Row

def import_parse_rdd(data):
    # create the RDD
    rdd = sc.textFile(data)
    # remove the header
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)  # filter out header
    # split by comma
    split_rdd = rdd.map(lambda line: line.split(','))
    # build Row objects, converting empty numeric fields to 0
    row_rdd = split_rdd.map(lambda line: Row(
        YEAR=int(line[0]), MONTH=int(line[1]), DAY=int(line[2]), DAY_OF_WEEK=int(line[3]),
        AIRLINE=line[4], FLIGHT_NUMBER=int(line[5]),
        TAIL_NUMBER=line[6], ORIGIN_AIRPORT=line[7], DESTINATION_AIRPORT=line[8],
        SCHEDULED_DEPARTURE=line[9], DEPARTURE_TIME=line[10],
        DEPARTURE_DELAY=0 if line[11] == "" else float(line[11]),
        TAXI_OUT=0 if line[12] == "" else float(line[12]),
        WHEELS_OFF=line[13], SCHEDULED_TIME=line[14],
        ELAPSED_TIME=0 if line[15] == "" else float(line[15]),
        AIR_TIME=0 if line[16] == "" else float(line[16]),
        DISTANCE=0 if line[17] == "" else float(line[17]),
        WHEELS_ON=line[18],
        TAXI_IN=0 if line[19] == "" else float(line[19]),
        SCHEDULED_ARRIVAL=line[20], ARRIVAL_TIME=line[21],
        ARRIVAL_DELAY=0 if line[22] == "" else float(line[22]),
        DIVERTED=line[23], CANCELLED=line[24], CANCELLATION_REASON=line[25],
        AIR_SYSTEM_DELAY=line[26], SECURITY_DELAY=line[27], AIRLINE_DELAY=line[28],
        LATE_AIRCRAFT_DELAY=line[29], WEATHER_DELAY=line[30]))
    return row_rdd
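For context, a hypothetical invocation (the file path is invented):

flight_rdd = import_parse_rdd("flights.csv")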
If I run flight_rdd.take(1), the output is:
[Row(YEAR=2015, MONTH=6, DAY=26, DAY_OF_WEEK=5, AIRLINE='EV', FLIGHT_NUMBER=4951, TAIL_NUMBER='N707EV', ORIGIN_AIRPORT='BHM', DESTINATION_AIRPORT='LGA', SCHEDULED_DEPARTURE='630', DEPARTURE_TIME='629', DEPARTURE_DELAY=-1.0, TAXI_OUT=13.0, WHEELS_OFF='642', SCHEDULED_TIME='155', ELAPSED_TIME=141.0, AIR_TIME=113.0, DISTANCE=866.0, WHEELS_ON='935', TAXI_IN=15.0, SCHEDULED_ARRIVAL='1005', ARRIVAL_TIME='950', ARRIVAL_DELAY=-15.0, DIVERTED='0', CANCELLED='0', CANCELLATION_REASON='', AIR_SYSTEM_DELAY='', SECURITY_DELAY='', AIRLINE_DELAY='', LATE_AIRCRAFT_DELAY='', WEATHER_DELAY='')]
I would like to make a user-defined hash partitioning function with the ARRIVAL_DELAY column as the key.
If possible, I would also like the min and max values of the ARRIVAL_DELAY column to be used to determine the distribution across the partitions.
The furthest I have gotten is understanding that the call takes the form
flight_rdd.partitionBy(numPartitions, partitionFunc)
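One caveat: partitionBy() is defined on pair RDDs, so the Rows have to be keyed by ARRIVAL_DELAY first. Below is a minimal sketch of a custom partitioner that spreads keys across partitions in proportion to the min/max range, as asked; the partition count and names such as delay_partitioner are invented for illustration:

# assumption: flight_rdd is the Row RDD produced by import_parse_rdd above
num_parts = 8  # hypothetical partition count

# compute the global min and max of ARRIVAL_DELAY once, up front
delays = flight_rdd.map(lambda r: r.ARRIVAL_DELAY)
min_delay = delays.min()
max_delay = delays.max()

def delay_partitioner(delay):
    # map a delay into [0, num_parts) in proportion to its position
    # in the [min_delay, max_delay] range
    span = (max_delay - min_delay) or 1.0
    idx = int((delay - min_delay) / span * num_parts)
    return min(idx, num_parts - 1)

# key each Row by its ARRIVAL_DELAY, then partition on that key
keyed = flight_rdd.map(lambda r: (r.ARRIVAL_DELAY, r))
partitioned = keyed.partitionBy(num_parts, delay_partitioner)

Strictly speaking this distributes by range rather than by hash; for a plain hash of the key, partitionBy() already uses PySpark's portable_hash when no function is passed.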
YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,2,6,5,OO,6271,N937SW,FAR,DEN,1712,1701,-11,15,1716,123,117,95,627,1751,7,1815,1758,-17,0,0,,,,,,
2015,1,19,1,AA,1605,N496AA,DFW,ONT,1740,1744,4,15,1759,193,198,175,1188,1854,8,1853,1902,9,0,0,,,,,,
2015,3,8,7,NK,1068,N519NK,LAS,CLE,2220,2210,-10,12,2222,238,229,208,1824,450,9,518,459,-19,0,0,,,,,,
2015,9,21,1,AA,1094,N3EDAA,DFW,BOS,1155,1155,0,12,1207,223,206,190,1562,1617,4,1638,1621,-17,0,0,,,,,,
The above is the unprocessed version of the data set.
Hello, I would like to convert the empty strings in my RDD to 0.
I have read in 20 files, and they are in this format:
YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,2,6,5,OO,6271,N937SW,FAR,DEN,1712,1701,-11,15,1716,123,117,95,627,1751,7,1815,1758,-17,0,0,,,,,,
2015,1,19,1,AA,1605,N496AA,DFW,ONT,1740,1744,4,15,1759,193,198,175,1188,1854,8,1853,1902,9,0,0,,,,,,
2015,3,8,7,NK,1068,N519NK,LAS,CLE,2220,2210,-10,12,2222,238,229,208,1824,450,9,518,459,-19,0,0,,,,,,
2015,9,21,1,AA,1094,N3EDAA,DFW,BOS,1155,1155,0,12,1207,223,206,190,1562,1617,4,1638,1621,-17,0,0,,,,,,
I would like to fill these empty strings with the number 0.
from pyspark.sql import Row

def import_parse_rdd(data):
    # create the RDD
    rdd = sc.textFile(data)
    # remove the header
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)  # filter out header
    # split by comma
    split_rdd = rdd.map(lambda line: line.split(','))
    row_rdd = split_rdd.map(lambda line: Row(
        YEAR=int(line[0]), MONTH=int(line[1]), DAY=int(line[2]), DAY_OF_WEEK=int(line[3]),
        AIRLINE=line[4], FLIGHT_NUMBER=int(line[5]),
        TAIL_NUMBER=line[6], ORIGIN_AIRPORT=line[7], DESTINATION_AIRPORT=line[8],
        SCHEDULED_DEPARTURE=line[9], DEPARTURE_TIME=line[10],
        DEPARTURE_DELAY=line[11], TAXI_OUT=line[12],
        WHEELS_OFF=line[13], SCHEDULED_TIME=line[14],
        ELAPSED_TIME=line[15], AIR_TIME=line[16], DISTANCE=line[17],
        WHEELS_ON=line[18], TAXI_IN=line[19],
        SCHEDULED_ARRIVAL=line[20], ARRIVAL_TIME=line[21], ARRIVAL_DELAY=line[22],
        DIVERTED=line[23], CANCELLED=line[24], CANCELLATION_REASON=line[25],
        AIR_SYSTEM_DELAY=line[26], SECURITY_DELAY=line[27], AIRLINE_DELAY=line[28],
        LATE_AIRCRAFT_DELAY=line[29], WEATHER_DELAY=line[30]))
    return row_rdd
The above is the code I am running.
I am working with RDD Row objects, not a DataFrame.
You can use na.fill("0") to replace all nulls with "0" strings.
spark.read.csv("path/to/file").na.fill(value="0").show()
In case you need integers, you can change the schema to convert string columns to integers.
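A minimal sketch of that cast, assuming the same reader as above (header=True is added here so the column names from the question's header are picked up; the file path is a placeholder):

from pyspark.sql import functions as F

df = spark.read.csv("path/to/file", header=True).na.fill(value="0")
df = df.withColumn("ARRIVAL_DELAY", F.col("ARRIVAL_DELAY").cast("int"))
df.show()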
You could add this to your DataFrame to apply the change to a column named 'col_name':
from pyspark.sql import functions as F
(...)
.withColumn('col_name', F.regexp_replace('col_name', '^$', '0'))  # replace empty strings with '0'
You could use this syntax directly in your code.
You can add an if-else condition while creating the Row.
Let's consider WEATHER_DELAY:
row_rdd = split_rdd.map(lambda line: Row(
    # ...all other columns...
    WEATHER_DELAY=0 if line[30] == "" else line[30]))
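The same condition can be factored into a small helper and applied to every numeric field; a sketch (the helper name to_num is invented):

def to_num(value):
    # empty CSV field -> 0, otherwise a float
    return 0 if value == "" else float(value)

row_rdd = split_rdd.map(lambda line: Row(
    # ...all other columns...
    ARRIVAL_DELAY=to_num(line[22]),
    WEATHER_DELAY=to_num(line[30])))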
Please allow me another try at your problem, using a per-element function applied over the RDD. Note that foreach() is an action that returns None, so map() is the transformation to use if you want to keep the result:
def f(x):
    return x.replace(' ', '0')
(...)
row_rdd = row_rdd.map(f)  # to be added at the end of your script
I am trying to split an Oracle table based on the values in a column (hospital names). The data set is ~3 million rows across 66 columns. I'm trying to write the data for one hospital from 3 different tables into 1 Excel workbook in 3 different sheets.
I have running code which worked for ~700K rows, but the new set is too large and I run into memory problems. I tried to modify my code to hit the database once per hospital name using a for loop, but I get an xlsxwriter error about closing the workbook explicitly.
import cx_Oracle
import getpass
import xlsxwriter
import pandas as pd

path = r"C:\HN\1"  # raw string so the backslashes are not treated as escapes
p = getpass.getpass()

# Connecting to Oracle
myusername = 'CN138609'
dsn_tns = cx_Oracle.makedsn('oflc1exa03p-vip.centene.com', '1521', service_name='IGX_APP_P')
conn = cx_Oracle.connect(user=myusername, password=p, dsn=dsn_tns)

sql_4 = "select distinct hospital_name from HN_Hosp_Records"
df4 = pd.read_sql(sql_4, conn)
hospital_name = list(df4['HOSPITAL_NAME'])

for x in hospital_name:
    hosp_name = {"hosp": x}
    sql_1 = "select * from HN_Hosp_Records where hospital_name = :hosp"
    sql_2 = "select * from HN_CAP_Claims_Not_In_DHCS where hospital_name = :hosp"
    sql_3 = "select * from HN_Denied_Claims where hospital_name = :hosp"
    df1 = pd.read_sql(sql_1, conn, params=hosp_name)
    df2 = pd.read_sql(sql_2, conn, params=hosp_name)
    df3 = pd.read_sql(sql_3, conn, params=hosp_name)
    df_dhcs = df1.loc[df1['HOSPITAL_NAME'] == x]
    df_dw = df2.loc[df2['HOSPITAL_NAME'] == x]
    df_denied = df3.loc[df3['HOSPITAL_NAME'] == x]
    # Create a new excel workbook per hospital
    writer = pd.ExcelWriter(path + x + "_HNT_P2_REC_05062019.xlsx", engine='xlsxwriter')
    # Write each dataframe to a different worksheet.
    df_dhcs.to_excel(writer, sheet_name="DHCS")
    df_dw.to_excel(writer, sheet_name="Not In DHCS")
    df_denied.to_excel(writer, sheet_name="Denied")
    writer.close()
Here is the warning/error I'm getting. The code doesn't stop, but no file is being output:
File "C:\ProgramData\Anaconda3\lib\site-packages\xlsxwriter\workbook.py", line 153, in __del__
raise Exception("Exception caught in workbook destructor. "
Exception: Exception caught in workbook destructor. Explicit close() may be required for workbook.
I solved it. Using %s string formatting instead of bind variables was the trick.
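A hypothetical reconstruction of that change for one of the queries (bind variables are normally preferred; interpolating values into the SQL text is only reasonable for trusted, internally generated values such as these hospital names):

sql_1 = "select * from HN_Hosp_Records where hospital_name = '%s'" % x
df1 = pd.read_sql(sql_1, conn)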
First row in the data file:
1,Maria,Anders,Berlin,Germany,0300174321
f = open("Customer.csv", "r")
for row in f.readlines():
    a = row
    x = a.split(",")
    ID1 = print(x[0].replace("", ""))
    FIRST_NM1 = print(x[1])
    LAST_NM1 = print(x[2])
    CITY1 = print(x[3])
    COUNTRY1 = print(x[4])
    PHONE1 = print(x[5])
    cursor = cs.cursor()
    cursor.execute("INSERT INTO sales.dbo.Customer_temp (ID,FIRST_NM,LAST_NM,CITY,COUNTRY,PHONE) VALUES ('%s','%s','%s','%s','%s','%s')" % (ID1, FIRST_NM1, LAST_NM1, CITY1, COUNTRY1, PHONE1))
    cs.commit()
But it is inserting None into all rows, so could you please advise?
Instead of printing the values, you need to assign them:
FIRST_NM1 = x[1]
LAST_NM1 = x[2]
CITY1 = x[3]
etc..
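For completeness, a sketch of the corrected loop: print() returns None, which is why None was inserted everywhere, while plain assignment keeps the values. This keeps the question's %s formatting, although driver-level parameter binding would be the safer choice:

f = open("Customer.csv", "r")
cursor = cs.cursor()
for row in f.readlines():
    x = row.strip().split(",")
    ID1 = x[0]
    FIRST_NM1 = x[1]
    LAST_NM1 = x[2]
    CITY1 = x[3]
    COUNTRY1 = x[4]
    PHONE1 = x[5]
    cursor.execute(
        "INSERT INTO sales.dbo.Customer_temp (ID,FIRST_NM,LAST_NM,CITY,COUNTRY,PHONE) "
        "VALUES ('%s','%s','%s','%s','%s','%s')"
        % (ID1, FIRST_NM1, LAST_NM1, CITY1, COUNTRY1, PHONE1))
cs.commit()
f.close()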