Python Pandas sum() the difference between two values - python-3.x

In my python code, using pandas i have to resample a datetimedata series and calculate diffs between a column values (the sum of diffs between values), i write this piece of code:
import pandas as pd
import datetime
from .models import Results, VarsResults
start_date = datetime.date(2021, 6, 21)
end_date = datetime.date(2021, 6, 24)
def calc_q(start_d, end_d):
start_d = start_date
end_d = end_date
var_results = VarsResults.objects.filter(
id_res__read_date__range=(start_d, end_d)
).select_related(
"id_res"
).values(
"id_res__read_date",
"id_res__unit_id",
"id_res__device_id",
"id_res__proj_code",
"var_val",
)
df = pd.DataFrame(list(var_results))
df['id_res__read_date'] = pd.to_datetime(df['id_res__read_date'])
df = df.set_index('id_res__read_date')
df_15 = df.resample('15min').sum()
return df_15
but i get the sum of the values itself.
example
... | 5
... | 3
... | 1
i get 9
i would the sum of the difference between values not the sum of the values:
in this case 4 (5-3 = 2 + 3-1 = 2, 2+2)
Is there a method in pandas using resample for manage this kind of clcultion?
So many thanks in advance
Manuel

The sum of all the differences is equal to the difference between the first element and the last one: if you work it out, all the other elements cancel out. In your data for example the 3 cancels out:
(5-3) + (3-1)
= 5 - 3 + 3 - 1 # - 3 and + 3 cancel out
= 5 - 1
I don't know how Pandas works, but you can simply do the equivalent of first_value - last_value.

Related

Python Pandas apply function not being applied to every row when using variables from a DataFrame

I have this weird Pandas problem, when I use the apply function using values from a data frame, it only gets applied to the first row:
import pandas as pd
# main data frame - to be edited
headerData = [['dataA', 'dataB']]
valuesData = [[10, 20], [10, 20]]
dfData = pd.DataFrame(valuesData, columns = headerData)
dfData.to_csv('MainData.csv', index=False)
readMainDataCSV = pd.read_csv('MainData.csv')
print(readMainDataCSV)
#variable data frame - pull values from this to edit main data frame
headerVariables = [['varA', 'varB']]
valuesVariables = [[2, 10]]
dfVariables = pd.DataFrame(valuesVariables, columns = headerVariables)
dfVariables.to_csv('Variables.csv', index=False)
readVariablesCSV = pd.read_csv('Variables.csv')
readVarA = readVariablesCSV['varA']
readVarB = readVariablesCSV['varB']
def formula(x):
return (x / readVarA) * readVarB
dfFormulaApplied = readMainDataCSV.apply(lambda x: formula(x))
print('\n', dfFormulaApplied)
Output:
dataA dataB
0 50.0 100.0
1 NaN NaN
But when I just use regular variables (not being called from a data frame), it functions just fine:
import pandas as pd
# main data frame - to be edited
headerData = [['dataA', 'dataB']]
valuesData = [[10, 20], [20, 40]]
dfData = pd.DataFrame(valuesData, columns = headerData)
dfData.to_csv('MainData.csv', index=False)
readMainDataCSV = pd.read_csv('MainData.csv')
print(readMainDataCSV)
# variables
readVarA = 2
readVarB = 10
def formula(x):
return (x / readVarA) * readVarB
dfFormulaApplied = readMainDataCSV.apply(lambda x: formula(x))
print('\n', dfFormulaApplied)
Output:
dataA dataB
0 50.0 100.0
1 100.0 200.0
Help please I'm pulling my hair out.
If you take readVarA and readVarB from the dataframe by selecting the column it is a pandas Series with an index, which gives a problem in the calculation (dividing a series by another series with a different index doesn't work).
You can take the first value from the series to get the value like this:
def formula(x):
return (x / readVarA[0]) * readVarB[0]

How to write from loop to dataframe

I'am trying to calculate 33 stock betas and write them to dataframe.
Unfortunately, I have an error in my code:
cannot concatenate object of type ""; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are vali
import pandas as pd
import numpy as np
stock1=pd.read_excel(r"C:\Users\Кир\Desktop\Uni\Master\Nasdaq\Financials 11.05\Nasdaq last\clean data\01.xlsx", '1') #read second sheet of excel file
stock2=pd.read_excel(r"C:\Users\Кир\Desktop\Uni\Master\Nasdaq\Financials 11.05\Nasdaq last\clean data\01.xlsx", '2') #read second sheet of excel file
stock2['stockreturn']=np.log(stock2.AdjCloseStock / stock2.AdjCloseStock.shift(1)) #stock ln return
stock2['SP500return']=np.log(stock2.AdjCloseSP500 / stock2.AdjCloseSP500.shift(1)) #SP500 ln return
stock2 = stock2.iloc[1:] #delete first row in dataframe
betas = pd.DataFrame()
for i in range(0,(len(stock2.AdjCloseStock)//52)-1):
betas = betas.append(stock2.stockreturn.iloc[i*52:(i+1)*52].cov(stock2.SP500return.iloc[i*52:(i+1)*52])/stock2.SP500return.iloc[i*52:(i+1)*52].cov(stock2.SP500return.iloc[i*52:(i+1)*52]))
My data looks like weekly stock and S&P index return for 33 years. So the output should have 33 betas.
I tried simplifying your code and creating an example. I think the problem is that your calculation returns a float. You want to make it a pd.Series. DataFrame.append takes:
DataFrame or Series/dict-like object, or list of these
np.random.seed(20)
df = pd.DataFrame(np.random.randn(33*53, 2),
columns=['a', 'b'])
betas = pd.DataFrame()
for year in range(len(df['a'])//52 -1):
# Take some data
in_slice = pd.IndexSlice[year*52:(year+1)*52]
numerator = df['a'].iloc[in_slice].cov(df['b'].iloc[in_slice])
denominator = df['b'].iloc[in_slice].cov(df['b'].iloc[in_slice])
# Do some calculations and create a pd.Series from the result
data = pd.Series(numerator / denominator, name = year)
# Append to the DataFrame
betas = betas.append(data)
betas.index.name = 'years'
betas.columns = ['beta']
betas.head():
beta
years
0 0.107669
1 -0.009302
2 -0.063200
3 0.025681
4 -0.000813

Identifying groups of two rows that satisfy three conditions in a dataframe

I have the df below and want to identify any two orders that satisfy all the following condtions:
Distance between pickups less than X miles
Distance between dropoffs less Y miles
Difference between order creation times less Z minutes
Would use haversine import haversine to calculate the difference in pickups for each row and difference in dropoffs for each row or order.
The df I currently have looks like the following:
DAY  Order pickup_lat pickup_long dropoff_lat dropoff_long created_time
1/3/19 234e 32.69 -117.1 32.63 -117.08 3/1/19 19:00
1/3/19 235d 40.73 -73.98 40.73 -73.99 3/1/19 23:21
1/3/19 253w 40.76 -73.99 40.76 -73.99 3/1/19 15:26
2/3/19 231y 36.08 -94.2 36.07 -94.21 3/2/19 0:14
3/3/19 305g 36.01 -78.92 36.01 -78.95 3/2/19 0:09
3/3/19 328s 36.76 -119.83 36.74 -119.79 3/2/19 4:33
3/3/19 286n 35.76 -78.78 35.78 -78.74 3/2/19 0:43
I want my output df to be any 2 orders or rows that satisfy the above conditions. What I am not sure of is how to calculate that for each row in the dataframe to return any two rows that satisfy those condtions.
I hope I am explaining my desired output correctly. Thanks for looking!
I don't know if it is an optimal solution, but I didn't come up with something different. What I have done:
created dataframe with all possible orders combination,
computed all needed measures and for all of the combinations, I added those measures column to the dataframe,
find the indices of the rows which fulfill the mentioned conditions.
The code:
#create dataframe with all combination
from itertools import combinations
index_comb = list(combinations(trips.index, 2))#trip, your dataframe
col_names = trips.columns
orders1= pd.DataFrame([trips.loc[c[0],:].values for c in index_comb],columns=trips.columns,index = index_comb)
orders2= pd.DataFrame([trips.loc[c[1],:].values for c in index_comb],columns=trips.columns,index = index_comb)
orders2 = orders2.add_suffix('_1')
combined = pd.concat([orders1,orders2],axis=1)
from haversine import haversine
def distance(row):
loc_0 = (row[0],row[1]) # (lat, lon)
loc_1 = (row[2],row[3])
return haversine(loc_0,loc_1,unit='mi')
#pickup diff
pickup_cols = ["pickup_long","pickup_lat","pickup_long_1","pickup_lat_1"]
combined[pickup_cols] = combined[pickup_cols].astype(float)
combined["pickup_dist_mi"] = combined[pickup_cols].apply(distance,axis=1)
#dropoff diff
dropoff_cols = ["dropoff_lat","dropoff_long","dropoff_lat_1","dropoff_long_1"]
combined[dropoff_cols] = combined[dropoff_cols].astype(float)
combined["dropoff_dist_mi"] = combined[dropoff_cols].apply(distance,axis=1)
#creation time diff
combined["time_diff_min"] = abs(pd.to_datetime(combined["created_time"])-pd.to_datetime(combined["created_time_1"])).astype('timedelta64[m]')
#Thresholds
Z = 600
Y = 400
X = 400
#find orders with below conditions
diff_time_Z = combined["time_diff_min"] < Z
pickup_dist_X = combined["pickup_dist_mi"]<X
dropoff_dist_Y = combined["dropoff_dist_mi"]<Y
contitions_idx = diff_time_Z & pickup_dist_X & dropoff_dist_Y
out = combined.loc[contitions_idx,["Order","Order_1","time_diff_min","dropoff_dist_mi","pickup_dist_mi"]]
The output for your data:
Order Order_1 time_diff_min dropoff_dist_mi pickup_dist_mi
(0, 5) 234e 328s 573.0 322.988195 231.300179
(1, 2) 235d 253w 475.0 2.072803 0.896893
(4, 6) 305g 286n 34.0 19.766096 10.233550
Hope I understand you well and that will help.
Using your dataframe as above. Drop the index. I'm presuming your created_time column is in datetime format.
import pandas as pd
from geopy.distance import geodesic
Cross merge the dataframe to get all possible combinations of 'Order'.
df_all = pd.merge(df.assign(key=0), df.assign(key=0), on='key').drop('key', axis=1)
Remove all the rows where the orders are equal.
df_all = df_all[-(df_all['Order_x'] == df_all['Order_y'])].copy()
Drop duplicate rows where Order_x, Order_y == [a, b] and [b, a]
# drop duplicate rows
# first combine Order_x and Order_y into a sorted list, and combine into a string
df_all['dup_order'] = df_all[['Order_x', 'Order_y']].values.tolist()
df_all['dup_order'] = df_all['dup_order'].apply(lambda x: "".join(sorted(x)))
# drop the duplicates and reset the index
df_all = df_all.drop_duplicates(subset=['dup_order'], keep='first')
df_all.reset_index(drop=True)
Create a column calculate the time difference in minutes.
df_all['time'] = (df_all['dt_ceated_x'] - df_all['dt_ceated_y']).abs().astype('timedelta64[m]')
Create a column and calculate the distance between drop offs.
df_all['dropoff'] = df_all.apply(
(lambda row: geodesic(
(row['dropoff_lat_x'], row['dropoff_long_x']),
(row['dropoff_lat_x'], row['dropoff_long_y'])
).miles),
axis=1
)
Create a column and calculate the distance between pickups.
df_all['pickup'] = df_all.apply(
(lambda row: geodesic(
(row['pickup_lat_x'], row['pickup_long_x']),
(row['pickup_lat_x'], row['pickup_long_y'])
).miles),
axis=1
)
Filter the results as desired.
X = 1500
Y = 2000
Z = 100
mask_pickups = df_all['pickup'] < X
mask_dropoff = df_all['dropoff'] < Y
mask_time = df_all['time'] < Z
print(df_all[mask_pickups & mask_dropoff & mask_time][['Order_x', 'Order_y', 'time', 'dropoff', 'pickup']])
Order_x Order_y time dropoff pickup
10 235d 231y 53.0 1059.026620 1059.026620
11 235d 305g 48.0 260.325370 259.275948
13 235d 286n 82.0 249.306279 251.929905
25 231y 305g 5.0 853.308110 854.315567
27 231y 286n 29.0 865.026077 862.126593
34 305g 286n 34.0 11.763787 7.842526

Python3, with pandas.dataframe, how to select certain data by some rules to show

I have a pandas.dataframe, and I want to select certain data by some rules.
The following codes generate the dataframe
import datetime
import pandas as pd
import numpy as np
today = datetime.date.today()
dates = list()
for k in range(10):
a_day = today - datetime.timedelta(days=k)
dates.append(np.datetime64(a_day))
np.random.seed(5)
df = pd.DataFrame(np.random.randint(100, size=(10, 3)),
columns=('other1', 'actual', 'other2'),
index=['{}'.format(i) for i in range(10)])
df.insert(0, 'dates', dates)
df['err_m'] = np.random.rand(10, 1)*0.1
df['std'] = np.random.rand(10, 1)*0.05
df['gain'] = np.random.rand(10, 1)
Now, I want select by the following rules:
1. compute the sum of 'err_m' and 'std', then sort the df so that the sum is descending
2. from the result of step 1, select the part where 'actual' is > 50
Thanks
Create a new column and then sort by this one:
df['errsum'] = df['err_m'] + df['std']
# Return a sorted dataframe
df_sorted = df.sort('errsum', ascending = False)
Select the lines you want
# Create an array with True where the condition is met
selector = df_sorted['errsum'] > 50
# Return a view of sorted_dataframe with only the lines you want
df_sorted[selector]

Python Pandas: bootstrap confidence limits by row rather than entire dataframe

What I am trying to do is to get bootstrap confidence limits by row regardless of the number of rows and make a new dataframe from the output.I currently can do this for the entire dataframe, but not by row. The data I have in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The current generated output looks like this:
0 1
0 2.0 2.75
The python 3 code below generates a mock dataframe and generates the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just 2 values, a upper and a lower confidence limit rather than 4 sets of 2(one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.
scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose, and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)
Below is the answer I ended up figuring out to create bootstrap ci by row.
import pandas as pd
import numpy as np
import numpy.random as npr
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
x= zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
def bootstrap(data, num_samples, statistic, alpha):
n = len(data)
idx = npr.randint(0, n, (num_samples, n))
samples = data[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
cc = list(a.index.values) # informs generator of the number of rows
def bootbyrow(cc):
for xx in range(1):
xx = list(a.index.values)
for xx in range(len(cc)):
k = a.apply(lambda y: y[xx])
k = k.values
for xx in range(1):
kk = list(bootstrap(k,10000,np.mean,0.05))
yield list(kk)
abc = pd.DataFrame(list(bootbyrow(cc))) #bootstrap ci by row
# the next 4 just show that its working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)

Resources