How to transform a dataframe based on if/else conditions? - python-3.x

I am trying to build a function which transforms a dataframe based on certain conditions, but I am getting a SyntaxError. I am not sure what I am doing wrong. Any help will be appreciated. Thank you!
import pandas as pd
from datetime import datetime
from datetime import timedelta
df = pd.read_csv('example1.csv')
df.columns = ['dtime', 'kW']
df['dtime'] = pd.to_datetime(df['dtime'])
df.head(5)
                 dtime     kW
0  2019-08-27 23:30:00  0.016
1  2019-08-27 23:00:00      0
2  2019-08-27 22:30:00  0.016
3  2019-08-27 22:00:00  0.016
4  2019-08-27 21:30:00      0
def transdf(df):
    a = df.loc[0, 'dtime']
    b = df.loc[1, 'dtime']
    c = a - b
    minutes = c.total_seconds() / 60
    d = int(minutes)
    # d can only be 15, 30 or 60
    if d == 15:
        return df=df.set_index('dtime').asfreq('-15T',fill_value='Missing')
    elif d == 30:
        return df=df.set_index('dtime').asfreq('-30T',fill_value='Missing')
    elif d == 60:
        return df=df.set_index('dtime').asfreq('-60T',fill_value='Missing')
    else:
        return None

First, it is cleaner to have a single return statement after the if/elif/else block at the end of your function; inside each branch, just update the value of df. The actual problem is that return df = ... is invalid syntax: df = ... is an assignment statement, not an expression, so it cannot follow return. That is why you are getting a SyntaxError.
def transform(df):
    a = df.loc[0, 'dtime']
    b = df.loc[1, 'dtime']
    c = a - b
    minutes = c.total_seconds() / 60
    d = int(minutes)
    # d can only be 15, 30 or 60
    if d == 15:
        df = df.set_index('dtime').asfreq('-15T', fill_value='Missing')
    elif d == 30:
        df = df.set_index('dtime').asfreq('-30T', fill_value='Missing')
    elif d == 60:
        df = df.set_index('dtime').asfreq('-60T', fill_value='Missing')
    else:
        df = None
    return df
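As an aside, a more compact variant (just a sketch; transform_compact is an illustrative name, and it assumes the same newest-first sample data as above) maps the detected step straight into the frequency string instead of branching:
import pandas as pd

def transform_compact(df):
    # Detect the sampling interval in minutes from the first two rows.
    minutes = int((df.loc[0, 'dtime'] - df.loc[1, 'dtime']).total_seconds() / 60)
    if minutes not in (15, 30, 60):
        return None
    # The sample data runs newest-first, hence the negative frequency.
    return df.set_index('dtime').asfreq(f'-{minutes}T', fill_value='Missing')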

Related

Finding time differences between the message statuses in python

So, I want to calculate the time differences; and the file looks something like this
id  message_id     send_date            status
0   5f74b996a2b7e  2020-10-01 00:00:07  sent
1   5f74b996a2b7e  2020-10-01 00:00:09  delivered
2   5f74b99e85b3c  2020-10-02 02:00:14  sent
3   5f74b99e85b3c  2020-10-02 02:01:16  delivered
4   5f74b99e85b3c  2020-10-02 08:06:49  read
5   5f74b996a2b7e  2020-10-02 15:16:32  read
6   5f9d97ff1af9e  2020-10-14 13:45:43  sent
7   5f9d97ff1af9e  2020-10-14 13:45:45  delivered
8   5f9d97ff1af9e  2020-10-14 13:50:48  read
9   5f74b9a35b6c5  2020-10-16 19:01:19  sent
10  5f74b9a35b6c5  2020-10-16 19:01:25  deleted
The columns are id, which auto-increments; message_id, which is unique to each message; send_date, the timestamp; and status, the message status (there are 5 statuses: sent, delivered, read, failed, and deleted).
I want to calculate the time difference from sent to delivered and, where the message was delivered, from delivered to read.
I know something like this can be handy, but I am not sure how to apply it per message_id:
from datetime import datetime
s1 = '2020-10-14 13:45:45'
s2 = '2020-10-14 13:50:48' # for example
FMT = '%Y-%m-%d %H:%M:%S'
tdelta = datetime.strptime(s2, FMT) - datetime.strptime(s1, FMT)
print(tdelta)
Ref: https://stackoverflow.com/questions/3096953/how-to-calculate-the-time-interval-between-two-time-strings
The expected output would be,
  message_id     delivered_diff  read_diff        deleted_diff
0 5f74b996a2b7e  00:00:02        1 day, 15:16:23
1 5f74b99e85b3c  00:01:02        6:05:33
2 5f9d97ff1af9e  00:00:02       0:05:03
3 5f74b9a35b6c5                                   0:00:06
You can do this with pandas and datetime. The code below is commented for clarity and was written against Python 3.8.
import datetime
import pandas as pd

def time_delta(a, b):
    # Calculate the timedelta between two date strings.
    fmt = '%Y-%m-%d %H:%M:%S'
    return datetime.datetime.strptime(b, fmt) - datetime.datetime.strptime(a, fmt)

def calculate_diff(val, first_status, second_status):
    # Check that both statuses exist for this message.
    if not val['status'].str.contains(first_status).any() or not val['status'].str.contains(second_status).any():
        return ''
    a = val.loc[val['status'] == first_status, 'send_date'].values[0]   # first send_date for the first status
    b = val.loc[val['status'] == second_status, 'send_date'].values[0]  # first send_date for the second status
    return time_delta(a, b)  # calculate the delta

df = pd.read_csv('test.csv', sep=';')  # load csv file with ; as separator
grouped = df.groupby('message_id')     # group by message ids
rows = []
for message_id, values in grouped:  # calculate the results for each group
    rows.append({
        'message_id': message_id,
        'delivered_diff': calculate_diff(values, 'sent', 'delivered'),  # delta between sent and delivered
        'read_diff': calculate_diff(values, 'delivered', 'read'),       # delta between delivered and read
        'deleted_diff': calculate_diff(values, 'sent', 'deleted'),      # delta between sent and deleted
    })
# build the final result in one go (DataFrame.append is deprecated in recent pandas)
final_df = pd.DataFrame(rows, columns=['message_id', 'delivered_diff', 'read_diff', 'deleted_diff'])
# print final result
print(final_df)
The result:
message_id delivered_diff read_diff deleted_diff
0 5f74b996a2b7e 0 days 00:00:02 1 days 15:16:23
1 5f74b99e85b3c 0 days 00:01:02 0 days 06:05:33
2 5f74b9a35b6c5 0 days 00:00:06
3 5f9d97ff1af9e 0 days 00:00:02 0 days 00:05:03
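If you prefer a more vectorized sketch of the same idea, pivot puts one timestamp column per status, after which the three differences are plain column subtractions (this assumes the same semicolon-separated file, at most one row per status per message_id, and that every referenced status occurs somewhere in the data):
import pandas as pd

df = pd.read_csv('test.csv', sep=';')
# One column per status, one row per message_id.
wide = df.pivot(index='message_id', columns='status', values='send_date').apply(pd.to_datetime)
result = pd.DataFrame({
    'delivered_diff': wide['delivered'] - wide['sent'],
    'read_diff': wide['read'] - wide['delivered'],
    'deleted_diff': wide['deleted'] - wide['sent'],
}).reset_index()
print(result)  # missing combinations come out as NaT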
import pandas as pd

final_dict = []
data = pd.read_csv('data.csv', names=['id', 'unique_id', 'time', 'status'])
data['time'] = pd.to_datetime(data['time'])
# data.info()

for name, group in data.groupby('unique_id'):
    for _, row in group.iterrows():
        if row['status'] == "sent":
            sent = row['time']  # remember when this message was sent
        elif row['status'] == "read":
            # subtract sent from the later timestamp so the delta comes out positive
            final_dict.append({row['unique_id']: {"read": str(row['time'] - sent)}})
        elif row['status'] == "delivered":
            final_dict.append({row['unique_id']: {"delivered": str(row['time'] - sent)}})
        elif row['status'] == "deleted":
            final_dict.append({row['unique_id']: {"deleted": str(row['time'] - sent)}})
print(final_dict)

Month starting date and ending date between a range of dates in python

The input is a date range, for which we need to find the starting and ending date of each month falling inside the interval. An example is given below.
input:
start date: 2018-6-15
end date: 2019-3-20
desired output:
[
["month starting date","month ending date"],
["2018-6-15","2018-6-30"],
["2018-7-1","2018-7-31"],
["2018-8-1","2018-8-31"],
["2018-9-1","2018-9-30"],
["2018-10-1","2018-10-31"],
["2018-11-1","2018-11-30"],
["2018-12-1","2018-12-31"],
["2019-1-1","2019-1-31"],
["2019-2-1","2019-2-28"],
["2019-3-1","2019-3-20"]
]
An option using pandas: create a date_range from start to end date, extract the month numbers from that as a pandas.Series, shift it 1 element forward and 1 element backward to retrieve a boolean mask where the months change (!=). Now you can create a DataFrame to work with or create a list of lists if you like.
Ex:
import pandas as pd

start_date, end_date = '2018-6-15', '2019-3-20'

dtrange = pd.date_range(start=start_date, end=end_date, freq='d')
months = pd.Series(dtrange.month)
starts, ends = months.ne(months.shift(1)), months.ne(months.shift(-1))
df = pd.DataFrame({'month_starting_date': dtrange[starts].strftime('%Y-%m-%d'),
                   'month_ending_date': dtrange[ends].strftime('%Y-%m-%d')})
# df
# month_starting_date month_ending_date
# 0 2018-06-15 2018-06-30
# 1 2018-07-01 2018-07-31
# 2 2018-08-01 2018-08-31
# 3 2018-09-01 2018-09-30
# 4 2018-10-01 2018-10-31
# 5 2018-11-01 2018-11-30
# 6 2018-12-01 2018-12-31
# 7 2019-01-01 2019-01-31
# 8 2019-02-01 2019-02-28
# 9 2019-03-01 2019-03-20
# as a list of lists:
l = [df.columns.values.tolist()] + df.values.tolist()
# l
# [['month_starting_date', 'month_ending_date'],
# ['2018-06-15', '2018-06-30'],
# ['2018-07-01', '2018-07-31'],
# ['2018-08-01', '2018-08-31'],
# ['2018-09-01', '2018-09-30'],
# ['2018-10-01', '2018-10-31'],
# ['2018-11-01', '2018-11-30'],
# ['2018-12-01', '2018-12-31'],
# ['2019-01-01', '2019-01-31'],
# ['2019-02-01', '2019-02-28'],
# ['2019-03-01', '2019-03-20']]
Note that I use strftime when I create the DataFrame. Do this if you want the output to be of dtype string. If you want to continue to work with datetime objects (timestamps), don't apply strftime.
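If you prefer working with monthly periods directly, here is an equivalent sketch using pd.period_range, clipping the first and last month to the interval bounds:
import pandas as pd

start_date, end_date = '2018-6-15', '2019-3-20'
lo, hi = pd.Timestamp(start_date), pd.Timestamp(end_date)
l = [['month_starting_date', 'month_ending_date']]
for p in pd.period_range(start_date, end_date, freq='M'):
    # Clip each month's start/end to the requested interval.
    l.append([max(p.start_time, lo).strftime('%Y-%m-%d'),
              min(p.end_time, hi).strftime('%Y-%m-%d')])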
This code is simple and uses only standard Python packages.
import calendar
from datetime import timedelta

def get_time_range_list(start_date, end_date):
    date_range_list = []
    while True:
        # Last day of the month that start_date falls in.
        month_end = start_date.replace(day=calendar.monthrange(start_date.year, start_date.month)[1])
        next_month_start = month_end + timedelta(days=1)
        if next_month_start <= end_date:
            date_range_list.append((start_date, month_end))
            start_date = next_month_start
        else:
            date_range_list.append((start_date, end_date))
            return date_range_list
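A quick usage sketch (the function accepts date or datetime objects, as long as both arguments are the same type):
from datetime import date

for month_start, month_end in get_time_range_list(date(2018, 6, 15), date(2019, 3, 20)):
    print(month_start, month_end)
# 2018-06-15 2018-06-30
# 2018-07-01 2018-07-31
# ...
# 2019-03-01 2019-03-20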

Subtract two columns using conditions

I have two date columns and I want to subtract them based on conditions: first, check the first column for blanks; then check the second column for blanks; and third, check whether the difference between the two dates is less than one day. If any of these conditions is met the result should be null, otherwise carry out the subtraction of the two columns. Something like this:
'''if [Recommendation signed] = null or [Executed Date] = null or Duration.Days([Contract Executed Date]-[Recommendation signed]) < 1 then null else Duration.Days([Contract Executed Date]-[Recommendation signed])'''
You can do that using the apply function, storing the result in a new column, here called day difference.
Make sure these are datetime columns (if they are not, apply the to_datetime function first).
df['Recommendation signed'] = pd.to_datetime(df['Recommendation signed']).dt.date
df['Executed Date'] = pd.to_datetime(df['Executed Date']).dt.date
df['Contract Executed Date'] = pd.to_datetime(df['Contract Executed Date']).dt.date

def subtract_columns(row):
    # Return None when either date is missing or the difference is under one day.
    if (pd.isnull(row['Recommendation signed'])
            or pd.isnull(row['Executed Date'])
            or (row['Contract Executed Date'] - row['Recommendation signed']) < pd.Timedelta(days=1)):
        return None
    return row['Contract Executed Date'] - row['Recommendation signed']

df['day difference'] = df.apply(subtract_columns, axis=1)
Hope this helps.
Here's one way to do it. Since no data was provided I created my own generator. The solution is contained within find_duration and how it is used in df.apply(find_duration, axis=1).
from datetime import datetime, timedelta
from itertools import islice

import numpy as np
import pandas as pd

RECOMMENDATION_IS_PENDING = "RECOMMENDATION_IS_PENDING"
EXECUTION_IS_PENDING = "EXECUTION_IS_PENDING"
COMPLETED_IN_LESS_THAN_ONE_DAY = "COMPLETED_IN_LESS_THAN_ONE_DAY"
COMPLETED_IN_MORE_THAN_ONE_DAY = "COMPLETED_IN_MORE_THAN_ONE_DAY"

MIN_YEAR = 1900
MAX_YEAR = 2020
NUM_YEARS = MAX_YEAR - MIN_YEAR + 1
START_DATE = datetime(MIN_YEAR, 1, 1, 0, 0, 0)
END_DATE = START_DATE + timedelta(days=365 * NUM_YEARS)
NUM_RECORDS = 20

def random_datetime(rng, dt):
    # Draw a datetime uniformly between START_DATE and END_DATE.
    return START_DATE + (END_DATE - START_DATE) * rng.uniform()

def less_than_one_day(rng, dt):
    # Offset dt by up to 23 hours.
    hours = int(np.round(23.0 * rng.uniform()))
    return dt + timedelta(hours=hours)

def more_than_one_day(rng, dt):
    # Offset dt by at least one day.
    days = 1 + int(np.round(100.0 * rng.uniform()))
    return dt + timedelta(days=days)

def null_datetime(rng, dt):
    return None

class RecordGenerator:
    PROBABILITIES = {
        RECOMMENDATION_IS_PENDING: 0.1,
        EXECUTION_IS_PENDING: 0.2,
        COMPLETED_IN_LESS_THAN_ONE_DAY: 0.2,
        COMPLETED_IN_MORE_THAN_ONE_DAY: 0.5,
    }
    GENERATORS = {
        RECOMMENDATION_IS_PENDING: (null_datetime, random_datetime),
        EXECUTION_IS_PENDING: (random_datetime, null_datetime),
        COMPLETED_IN_LESS_THAN_ONE_DAY: (random_datetime, less_than_one_day),
        COMPLETED_IN_MORE_THAN_ONE_DAY: (random_datetime, more_than_one_day),
    }

    def __init__(self, seed=0):
        self.rng = np.random.RandomState(seed)

    def __iter__(self):
        while True:
            # Pick a record kind according to PROBABILITIES.
            res = self.rng.uniform()
            for kind, val in self.PROBABILITIES.items():
                res -= val
                if res <= 0.0:
                    break
            recommendation_signed_fn, execution_date_fn = self.GENERATORS[kind]
            recommendation_signed = recommendation_signed_fn(self.rng, None)
            execution_date = execution_date_fn(self.rng, recommendation_signed)
            yield recommendation_signed, execution_date

def find_duration(row):
    # None for missing dates or durations under one day.
    duration = row["execution_date"] - row["recommendation_signed"]
    if duration is pd.NaT or duration < pd.Timedelta(days=1):
        return None
    return duration

if __name__ == "__main__":
    records = RecordGenerator()
    recommendation_signed_dates, execution_dates = zip(*islice(records, NUM_RECORDS))
    df = pd.DataFrame.from_dict({
        "recommendation_signed": recommendation_signed_dates,
        "execution_date": execution_dates,
    })
    print(f"`recommendation_signed` is null: [{df['recommendation_signed'].isnull().sum()}]")
    print(f"`execution_date` is null: [{df['execution_date'].isnull().sum()}]")
    print(f"`completed_in_less_than_one_day`: [{((df['execution_date'] - df['recommendation_signed']) < pd.Timedelta(days=1)).sum()}]")
    print(f"`completed_in_more_than_one_day`: [{((df['execution_date'] - df['recommendation_signed']) >= pd.Timedelta(days=1)).sum()}]")
    df["completion_time"] = df.apply(find_duration, axis=1)
    print(df)
Output:
`recommendation_signed` is null: [2]
`execution_date` is null: [2]
`completed_in_less_than_one_day`: [4]
`completed_in_more_than_one_day`: [12]
recommendation_signed execution_date completion_time
0 1986-06-25 08:07:14.808395 1986-08-25 08:07:14.808395 61 days
1 1951-03-25 17:08:27.986156 1951-05-30 17:08:27.986156 66 days
2 2007-11-01 03:42:35.672304 2007-11-02 01:42:35.672304 NaT
3 1995-09-26 12:52:16.917964 1995-09-27 00:52:16.917964 NaT
4 2011-12-03 23:24:45.808880 2011-12-11 23:24:45.808880 8 days
5 NaT 1902-06-12 22:41:33.183052 NaT
6 1994-02-04 07:01:47.052493 1994-05-03 07:01:47.052493 88 days
7 1996-08-19 20:06:42.217770 1996-10-05 20:06:42.217770 47 days
8 1914-04-21 14:09:37.598524 1914-06-25 14:09:37.598524 65 days
9 2014-03-25 07:15:55.137157 NaT NaT
10 1950-02-21 13:04:11.684479 1950-03-20 13:04:11.684479 27 days
11 1955-02-27 21:06:22.090510 1955-04-26 21:06:22.090510 58 days
12 NaT 1974-09-07 20:55:17.329968 NaT
13 1974-08-07 21:21:33.578522 1974-11-10 21:21:33.578522 95 days
14 1943-06-22 15:59:39.451885 1943-08-06 15:59:39.451885 45 days
15 1907-04-14 20:35:27.269379 1907-06-21 20:35:27.269379 68 days
16 1925-06-10 13:05:57.968982 1925-06-24 13:05:57.968982 14 days
17 1943-12-25 06:52:07.566032 1943-12-25 19:52:07.566032 NaT
18 2019-07-07 12:44:00.201327 2019-07-07 14:44:00.201327 NaT
19 1919-07-05 05:38:11.678570 NaT NaT
You could try something like this:
import numpy as np
import pandas as pd
from datetime import timedelta

df['Recommendation Signed'] = pd.to_datetime(df['Recommendation Signed'], errors='coerce')
df['Contract Executed Date'] = pd.to_datetime(df['Contract Executed Date'], errors='coerce')
df['date_difference'] = np.where(
    df['Recommendation Signed'].isnull()
    | df['Contract Executed Date'].isnull()
    | ((df['Contract Executed Date'] - df['Recommendation Signed']) < timedelta(days=1)),
    np.timedelta64('NaT'),  # the difference is a timedelta, so use a timedelta NaT
    df['Contract Executed Date'] - df['Recommendation Signed'])
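A slightly tidier equivalent (a sketch, assuming the same column names) computes the difference once and masks out the invalid rows:
import pandas as pd

diff = df['Contract Executed Date'] - df['Recommendation Signed']
invalid = (df['Recommendation Signed'].isnull()
           | df['Contract Executed Date'].isnull()
           | (diff < pd.Timedelta(days=1)))
df['date_difference'] = diff.mask(invalid)  # NaT wherever a condition fails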

Pandas: create multiple aggregate columns and merge multiple data frames in an elegant way

I am using the following code to create a few new aggregated columns based on the column version, and then I merge the four new data frames.
import pandas as pd

new_df = df[['version','duration']].groupby('version').mean().rename(columns=lambda x: 'mean_' + x).reset_index().fillna(0)
new_df1 = df[['version','duration']].groupby('version').std().rename(columns=lambda x: 'std_' + x).reset_index().fillna(0)
new_df2 = df[['version','ts']].groupby('version').min().rename(columns=lambda x: 'min_' + x).reset_index().fillna(0)
new_df3 = df[['version','ts']].groupby('version').max().rename(columns=lambda x: 'max_' + x).reset_index().fillna(0)

df_a = pd.merge(new_df, new_df1, on='version')
df_b = pd.merge(df_a, new_df2, on='version')
df_c = pd.merge(df_b, new_df3, on='version')
df_c
The output looks like below:
version mean_duration std_duration min_ts max_ts
0 1400422 451 1 2018-02-28 09:42:15 2018-02-28 09:42:15
1 7626065 426 601 2018-01-25 11:01:58 2018-01-25 11:15:22
2 7689209 658 473 2018-01-30 11:09:31 2018-02-01 05:19:23
3 7702304 711 80 2018-01-30 17:49:18 2018-01-31 12:27:20
The code works fine, but I am wondering is there a more elegant/clean way to do this? Thank you!
Using functools.reduce, you can chain the merges and reproduce your result:
import functools
l = [new_df, new_df1, new_df2, new_df3]
functools.reduce(lambda left, right: pd.merge(left, right, on=['version']), l)
Or, using agg, recreate what you need with a single groupby:
s = df.groupby('version').agg({'duration': ['mean', 'std'], 'ts': ['min', 'max']}).reset_index()
s.columns = s.columns.map('_'.join)
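Note that the joined names come out as duration_mean, ts_min, etc. (suffix rather than prefix), and that the version column, whose second level is empty after reset_index, becomes 'version_'. A one-line cleanup sketch:
s.columns = [c.rstrip('_') for c in s.columns]  # 'version_' -> 'version'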

how to replace a cell in a pandas dataframe

After forming the below python pandas dataframe (for example)
import pandas
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pandas.DataFrame(data,columns=['Name','Age'])
If I iterate through it, I get
In [62]: for i in df.itertuples():
...: print( i.Index, i.Name, i.Age )
...:
0 Alex 10
1 Bob 12
2 Clarke 13
What I would like to achieve is to replace the value of a particular cell
In [67]: for i in df.itertuples():
...: if i.Name == "Alex":
...: df.at[i.Index, 'Age'] = 100
...:
Which seems to work
In [64]: df
Out[64]:
Name Age
0 Alex 100
1 Bob 12
2 Clarke 13
The problem appears when I use a larger, different dataset and do the following: first, I create a new column named NETELEMENT with a default value of "", and then I try to replace that default value with the string that the function lookup_netelement returns:
df['NETELEMENT'] = ""
for i in df.itertuples():
    df.at[i.Index, 'NETELEMENT'] = lookup_netelement(i.PEER_SRC_IP)
    print(i, lookup_netelement(i.PEER_SRC_IP))
But what I get as a result is:
Pandas(Index=769, SRC_AS='', DST_AS='', COMMS='', SRC_COMMS=nan, AS_PATH='', SRC_AS_PATH=nan, PREF='', SRC_PREF='0', MED='0', SRC_MED='0', PEER_SRC_AS='0', PEER_DST_AS='', PEER_SRC_IP='x.x.x.x', PEER_DST_IP='', IN_IFACE='', OUT_IFACE='', PROTOCOL='udp', TOS='0', BPS=35200.0, SRC_PREFIX='', DST_PREFIX='', NETELEMENT='', IN_IFNAME='', OUT_IFNAME='') routerX
meaning that it should be:
NETELEMENT='routerX' instead of NETELEMENT=''
Could you please advise what I am doing wrong?
EDIT: for reasons of completeness the lookup_netelement is defined as
def lookup_netelement(ipaddr):
    try:
        x = LOOKUP['conn'].hget('ipaddr;{}'.format(ipaddr), 'dev') or b""
    except Exception as e:  # bind the exception, since the handler logs it
        logger.error('looking up `ipaddr` for netelement caused `{}`'.format(repr(e)), exc_info=True)
        x = b""
    x = x.decode("utf-8")
    return x
It sounds like you are looking for where for conditional replacement, i.e.
def wow(x):
    return x ** 10

df['new'] = df['Age'].where(~(df['Name'] == 'Alex'), wow(df['Age']))
Output:
Name Age new
0 Alex 10 10000000000
1 Bob 12 12
2 Clarke 13 13
3 Alex 15 576650390625
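If the goal is simply to overwrite the matching cells in place, plain .loc indexing does the same conditional replacement:
df.loc[df['Name'] == 'Alex', 'Age'] = 100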
Based on your edit, you are trying to apply the function, i.e.
df['new'] = df['PEER_SRC_IP'].apply(lookup_netelement)
Edit: for your comment on sending two columns, use a lambda with axis=1, i.e.
def wow(x, y):
    return '{} {}'.format(x, y)

df.apply(lambda x: wow(x['Name'], x['Age']), axis=1)
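For completeness, the result of the axis=1 apply can be assigned back like any other column (the name_age column name is just illustrative):
df['name_age'] = df.apply(lambda x: wow(x['Name'], x['Age']), axis=1)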
