Group by mean for elements with value > 0 - python-3.x

import pandas as pd
import numpy as np

df = pd.DataFrame({"x": [1, 2, 3, 0], "y": [1, 1, 1, 1]})
df.groupby("y").agg(x_sum=("x", np.mean))
This code gives the average of all x values, so the output is 1.5 ((1+2+3+0)/4 = 1.5),
but I want the average of x over only the values larger than 0, so the output should be (1+2+3)/3 = 2.
How should I do this?

Replace values not greater than 0 in the x column with NaN:
df.x = df.x.where(df.x.gt(0))
# alternative
# df.x = df.x.mask(df.x.le(0))
print (df)
     x  y
0  1.0  1
1  2.0  1
2  3.0  1
3  NaN  1

df1 = df.groupby("y").agg(x_sum=("x", np.mean))
print (df1)
   x_sum
y
1    2.0
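
A sketch of an alternative that leaves df unchanged: filter the rows first, then aggregate (the x_mean column name here is just illustrative):
df[df.x > 0].groupby("y").agg(x_mean=("x", "mean"))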

You can also write and use a custom aggregation function. Example:
import pandas as pd
import numpy as np

def mean_without_zero_values(values):
    vals = [v for v in values if v > 0]
    return np.mean(vals)

df = pd.DataFrame({"x": [1, 2, 3, 0], "y": [1, 1, 1, 1]})
result = df.groupby("y").agg(x_sum=("x", mean_without_zero_values))
print(result)
# output
#    x_sum
# y
# 1      2
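
One caveat worth noting: if every value in a group fails the v > 0 filter, np.mean receives an empty list and returns nan with a RuntimeWarning. A guarded variant (sketch):
def mean_without_zero_values(values):
    vals = [v for v in values if v > 0]
    # empty group -> NaN without the RuntimeWarning
    return np.mean(vals) if vals else np.nan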

Related

Optimizing using Pandas Data Frame

I have the following function that loads a csv into a data frame and then does some calculations. It takes about 4-5 minutes to do the calculations on a csv with a little over 100,000 lines. I was hoping there is a faster way.
def calculate_adeck_errors(in_file):
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")}')
    pd.set_option('display.max_columns', 12)
    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    #print(adeck_df)
    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    #print(carq_data)
    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    #print(final_df)

    row_list = []
    for index, row in carq_data.iterrows():
        position_time = row['POSDATETIME']
        for index, arow in final_df.iterrows():
            if arow['POSDATETIME'] == position_time:
                # match, so do calculations
                storm_id = arow['STORMID']
                model_base_time = arow['MODELDATETIME']
                the_hour = arow['TAU']
                the_model = arow['MODEL']
                point1 = float(row['LAT']), float(row['LON'])
                point2 = float(arow['LAT']), float(arow['LON'])
                if arow['LAT'] == 0.0:
                    dist_error = None
                else:
                    dist_error = int(round(haversine(point1, point2, miles=True)))
                if arow['WIND'] != 0:
                    wind_error = int(abs(int(row['WIND']) - int(arow['WIND'])))
                else:
                    wind_error = None
                if arow['PRES'] != 0:
                    pressure_error = int(abs(int(row['PRES']) - int(arow['PRES'])))
                else:
                    pressure_error = None
                lat_carq = row['LAT']
                lon_carq = row['LON']
                lat_model = arow['LAT']
                lon_model = arow['LON']
                wind_carq = row['WIND']
                wind_model = arow['WIND']
                pres_carq = row['PRES']
                pres_model = arow['PRES']
                row_list.append([storm_id, model_base_time, the_model, the_hour, lat_carq, lon_carq, lat_model, lon_model, dist_error,
                                 wind_carq, wind_model, wind_error, pres_carq, pres_model, pressure_error])

    result_df = pd.DataFrame(row_list)
    result_df = result_df.where((pd.notnull(result_df)), None)
    result_cols = ['StormID', 'ModelBasetime', 'Model', 'Tau',
                   'LatCARQ', 'LonCARQ', 'LatModel', 'LonModel', 'DistError',
                   'WindCARQ', 'WindModel', 'WindError',
                   'PresCARQ', 'PresModel', 'PresError']
    result_df.columns = result_cols

calculate_adeck_errors(infile)
To clarify what I'm doing:
1. The CARQ entries are the control (actual).
2. The other models are the guesses.
3. I'm comparing the control (CARQ) to the guesses to see what their errors are.
4. The basis of the comparison is MODELBASETIME = POSBASETIME.
5. A sample file I'm processing is here: http://vortexweather.com/downloads/adeck/aal062018.csv
I was hoping there is a faster way than how I'm doing it, or another pandas method besides iterrows.
Many thanks for any suggestion.
Bryan
This code takes about 10 seconds to run on your entire dataset!
The code looks very similar to what you have written, except that all of the operations within main_function have been vectorized. See Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects.
Notebook: 2018-09-13_adeck_error_calculations.ipynb
import pandas as pd
import numpy as np
import datetime
from haversine import haversine

def main_function(df, row):
    """
    The main difference here is that everything is vectorized
    Returns: DataFrame
    """
    df_new = pd.DataFrame()
    df_storage = pd.DataFrame()
    pos_datetime = df.POSDATETIME.isin([row['POSDATETIME']])  # creates a Boolean map
    array_len = len(pos_datetime)
    new_index = pos_datetime.index

    df_new['StormID'] = df.loc[pos_datetime, 'STORMID']
    df_new['ModelBaseTime'] = df.loc[pos_datetime, 'MODELDATETIME']
    df_new['Model'] = df.loc[pos_datetime, 'MODEL']
    df_new['Tau'] = df.loc[pos_datetime, 'TAU']

    # Distance
    df_new['LatCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LAT']), index=new_index).loc[pos_datetime, 0]
    df_new['LonCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LON']), index=new_index).loc[pos_datetime, 0]
    df_new['LatModel'] = df.loc[pos_datetime, 'LAT']
    df_new['LonModel'] = df.loc[pos_datetime, 'LON']

    def calc_dist_error(row):
        return round(haversine((row['LatCARQ'], row['LonCARQ']), (row['LatModel'], row['LonModel']), miles=True)) if row['LatModel'] != 0.0 else None

    df_new['DistError'] = df_new.apply(calc_dist_error, axis=1)

    # Wind
    df_new['WindCARQ'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_new['WindModel'] = df.loc[pos_datetime, 'WIND']
    df_storage['row_WIND'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_WIND'] = df.loc[pos_datetime, 'WIND']

    def wind_error_calc(row):
        return (row['row_WIND'] - row['df_WIND']) if row['df_WIND'] != 0 else None

    df_new['WindError'] = df_storage.apply(wind_error_calc, axis=1)

    # Air Pressure
    df_new['PresCARQ'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_new['PresModel'] = df.loc[pos_datetime, 'PRES']
    df_storage['row_PRES'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_PRES'] = df.loc[pos_datetime, 'PRES']

    def pres_error_calc(row):
        return abs(row['row_PRES'] - row['df_PRES']) if row['df_PRES'] != 0 else None

    df_new['PresError'] = df_storage.apply(pres_error_calc, axis=1)
    del(df_storage)
    return df_new

def calculate_adeck_errors(in_file):
    """
    Returns: DataFrame
    """
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    pd.set_option('display.max_columns', 20)
    pd.set_option('display.max_rows', 300)

    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    adeck_df['MODELDATETIME'] = pd.to_datetime(adeck_df['MODELDATETIME'], format='%Y-%m-%d %H:%M')
    adeck_df['POSDATETIME'] = pd.to_datetime(adeck_df['POSDATETIME'], format='%Y-%m-%d %H:%M')

    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    print('Len carq_data: ', len(carq_data))

    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    print('Len final_df: ', len(final_df))

    df_out_new = pd.DataFrame()
    for index, row in carq_data.iterrows():
        test_df = main_function(final_df, row)  # function call
        df_out_new = df_out_new.append(test_df, sort=False)

    df_out_new = df_out_new.reset_index(drop=True)
    df_out_new = df_out_new.where((pd.notnull(df_out_new)), None)
    print(f'Finishing Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    return df_out_new

in_file = 'aal062018.csv'
df = calculate_adeck_errors(in_file)
>>>Starting Data Calculations: 02:18:30AM on September 13, 2018
>>>Len carq_data: 56
>>>Len final_df: 137999
>>>Finishing Data Calculations: 02:18:39AM on September 13, 2018
print(len(df))
>>>95630
print(df.head(20))
Please don't forget to check the accepted solution. Enjoy!
Looks like you are creating two dataframes out of the same dataframe, and then processing them. Two things that may cut your time.
First, you are iterating over both dataframes and checking for a condition:
for _, row in carq_data.iterrows():
    for _, arow in final_df.iterrows():
        if arow['POSDATETIME'] == row['POSDATETIME']:
            # do something by using both tables
This is essentially an implementation of a join. You are joining carq_data with final_df on 'POSDATETIME'.
As a first step, you should merge the tables:
merged = carq_data.merge(final_df, on=['POSDATETIME'])
At this point you will get multiple rows for each matching 'POSDATETIME'. In the toy example below, column a plays the role of POSDATETIME:
>>> a
   a   b
0  1  11
1  1  33
>>> b
   a  b
0  1  2
1  1  3
2  1  4
>>> merged = a.merge(b, on=['a'])
>>> merged
   a  b_x  b_y
0  1   11    2
1  1   11    3
2  1   11    4
3  1   33    2
4  1   33    3
5  1   33    4
Now, to do your conditional calculations, you can use the apply() function.
First, define a function:
def calc_dist_error(row):
    return int(round(haversine(row['b_x'], row['b_y'], miles=True))) if row['a'] != 0.0 else None
Then apply it to every row:
merged['dist_error'] = merged.apply(calc_dist_error, axis=1)
Continuing my small example:
>>> merged['c'] = [1, 0, 0, 0, 2, 3]
>>> merged
   a  b_x  b_y  c
0  1   11    2  1
1  1   11    3  0
2  1   11    4  0
3  1   33    2  0
4  1   33    3  2
5  1   33    4  3
>>> def foo(row):
...     return row['b_x'] - row['b_y'] if row['c'] != 0 else None
...
>>> merged['dist_error'] = merged.apply(foo, axis=1)
>>> merged
   a  b_x  b_y  c  dist_error
0  1   11    2  1         9.0
1  1   11    3  0         NaN
2  1   11    4  0         NaN
3  1   33    2  0         NaN
4  1   33    3  2        30.0
5  1   33    4  3        29.0
This should help you reduce run time (see also this for how to check using %timeit). Hope this helps!
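
To make the merge idea concrete for the adeck data, here is a sketch (untested against the real csv; column names are taken from the question, and the suffixes argument labels the CARQ and model sides of the merge):
merged = carq_data.merge(final_df, on=['POSDATETIME'], suffixes=('_carq', '_model'))

def calc_dist_error(row):
    if row['LAT_model'] == 0.0:
        return None
    return int(round(haversine((row['LAT_carq'], row['LON_carq']),
                               (row['LAT_model'], row['LON_model']), miles=True)))

merged['DistError'] = merged.apply(calc_dist_error, axis=1)
# vectorized error columns: NaN where the model value is 0
merged['WindError'] = (merged['WIND_carq'] - merged['WIND_model']).abs().where(merged['WIND_model'] != 0)
merged['PresError'] = (merged['PRES_carq'] - merged['PRES_model']).abs().where(merged['PRES_model'] != 0)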

Keras aggregated objective function

How to add aggregated error to keras model?
Given this table:
   g  x  y
0  1  1  1
1  1  2  2
2  1  3  3
3  2  1  2
4  2  2  1
I would like to be able to minimize sum((y - y_pred) ** 2) error along with
sum((sum(y) - sum(y_pred)) ** 2) per group.
I'm fine with larger individual sample errors, but it is crucial for me to get the totals right.
SciPy example:
import pandas as pd
from scipy.optimize import differential_evolution

df = pd.DataFrame({'g': [1, 1, 1, 2, 2], 'x': [1, 2, 3, 1, 2], 'y': [1, 2, 3, 2, 1]})
g = df.groupby('g')

def linear(pars, fit=False):
    a, b = pars
    df['y_pred'] = a + b * df['x']
    if fit:
        sample_errors = sum((df['y'] - df['y_pred']) ** 2)
        group_errors = sum((g['y'].sum() - g['y_pred'].sum()) ** 2)
        total_error = sum(df['y'] - df['y_pred']) ** 2
        return sample_errors + group_errors + total_error
    else:
        return df['y_pred']

pars = differential_evolution(linear, [[0, 10]] * 2, args=(True,))['x']  # pass fit=True
print('SAMPLES:\n', df, '\nGROUPS:\n', g.sum(), '\nTOTALS:\n', df.sum())
Output:
SAMPLES:
    g  x  y  y_pred
0   1  1  1   1.232
1   1  2  2   1.947
2   1  3  3   2.662
3   2  1  2   1.232
4   2  2  1   1.947
GROUPS:
   x  y  y_pred
g
1  6  6   5.841
2  3  3   3.179
TOTALS:
g         7.000
x         9.000
y         9.000
y_pred    9.020
For grouping, as long as you keep the same groups throughout training, your loss function will not have problems with being non-differentiable.
As a naive form of grouping, you can simply separate the batches.
I suggest a generator for that.
# suppose you have these three numpy arrays:
gTrain
xTrain
yTrain

# create this generator
def grouper(g, x, y):
    while True:
        for gr in range(1, g.max() + 1):
            indices = g == gr
            yield (x[indices], y[indices])
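
For example, each next() call on the generator yields one group's full batch (assuming integer group ids running from 1 to g.max(), as above):
gen = grouper(gTrain, xTrain, yTrain)
x_batch, y_batch = next(gen)  # all samples from group 1
x_batch, y_batch = next(gen)  # all samples from group 2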
For the loss function, you can make your own:
import keras.backend as K

def customLoss(yTrue, yPred):
    return K.sum(K.square(yTrue - yPred)) + K.sum(K.sum(yTrue) - K.sum(yPred))

model.compile(loss=customLoss, ...)
Just be careful with the second term if you have negative values: unlike your stated objective it is not squared, so positive and negative deviations can cancel.
Now you train using the method fit_generator:
model.fit_generator(grouper(gTrain, xTrain, yTrain), steps_per_epoch=gTrain.max(), epochs=...)
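
For completeness, a minimal sketch of how the pieces could fit together; the single-feature Dense model and epochs=100 are only placeholders, and xTrain is assumed to have shape (n, 1):
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([
    Dense(8, activation='relu', input_shape=(1,)),  # placeholder architecture
    Dense(1)
])
model.compile(loss=customLoss, optimizer='adam')
model.fit_generator(grouper(gTrain, xTrain, yTrain),
                    steps_per_epoch=gTrain.max(), epochs=100)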

How do I iterate through data in a panda data frame?

I have created a 5 day moving average for 5 years' worth of data. How do I iterate through this to show whether the moving average is rising or falling for every single day? My code is simply giving me a single integer rather than a rising (+1) or falling (-1) answer for every day. Thank you!
import pandas as pd

df = pd.read_csv('file:///C:/Users/James Brodie/Desktop/USDJPY.csv', header=1, index_col=0)

ma5 = df['PX_LAST'].rolling(window=5).mean()
ma8 = df['PX_LAST'].rolling(window=8).mean()
ma21 = df['PX_LAST'].rolling(window=21).mean()

ma5x = []
for i in ma5:
    if i > i-1:
        ma5x = 1
    elif i < i-1:
        ma5x = -1
    else:
        ma5x = 0
print(ma5x)
ma5 = [5, 2, 2, 3, 3, 2, 5]
ma5x = []
lastItem = ma5[0]
for currItem in ma5[1:]:
    if currItem > lastItem:
        ma5x.append(1)
    elif currItem < lastItem:
        ma5x.append(-1)
    else:
        ma5x.append(0)
    lastItem = currItem
print(ma5x)
gives:
[-1, 0, 1, 0, -1, 1]
The elements of a list are best represented in pandas as a Series object (for the row index, replace list(range(len(ma5x))) with any index of your choice):
print('-----------')
import pandas as pd
pd_ma5x = pd.Series(ma5x, index=list(range(len(ma5x))))
print(pd_ma5x)
gives:
-----------
0   -1
1    0
2    1
3    0
4   -1
5    1
dtype: int64
If you use your own index, be aware that pd_ma5x is one element shorter than ma5.
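
As an aside, pandas can express the rising/falling signal without an explicit loop; a sketch using Series.diff and numpy's sign (the first element comes out as NaN because it has no predecessor):
import numpy as np
ma5x = np.sign(ma5.diff())  # +1 rising, -1 falling, 0 flat, NaN for the first element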

Get length of range in Python List Comprehension

I wonder if it is possible to get the length of the range in a list comprehension in Python 3, in order to use it in a condition like the one below. This code doesn't work:
b = [x**2 for x in range(10) if x % 2 == 0 and x > len/2]
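Compute the length once, bind it to a name, and use that name both for the range and in the condition: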
>>> n = 10
>>> b = [x**2 for x in range(n) if x % 2 == 0 and x > n/2]
>>> b
[36, 64]

matplotlib pcolormesh plot from x,y,z data

I have data in a text file in table form with three columns. I use np.genfromtxt to read all the columns into matplotlib as x, y, z.
I want to create a color mesh plot where x and y are the coordinates and z represents the color; I think people refer to such a plot as a heatmap.
My code is as follows:
x = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (0))
y = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (1))
z = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (2))
xmesh, ymesh = np.meshgrid(x,y)
diagram1.pcolormesh(xmesh,ymesh,z)
But I get the following error message:
line 7154, in pcolormesh
    C = ma.ravel(C[0:Ny-1, 0:Nx-1]) # data point in each cell is value at
IndexError: too many indices
The textfile is as follows:
1 1 5
2 1 4
3 1 2
4 1 6
1 2 6
2 2 2
3 2 1
4 2 9
1 3 7
2 3 4
3 3 3
4 3 5
1 4 3
2 4 4
3 4 7
4 4 6
How can I solve this?
In the example data provided above, x, y, and z can easily be reshaped to get a 2D array. The answer below is for someone who is looking for a more generalized answer with arbitrary x, y, and z arrays.
import matplotlib.pyplot as plt
from matplotlib.mlab import griddata
import numpy

# use your x, y and z arrays here
x = numpy.random.randint(1, 30, 50)
y = numpy.random.randint(1, 30, 50)
z = numpy.random.randint(1, 30, 50)

yy, xx = numpy.meshgrid(y, x)
zz = griddata(x, y, z, xx, yy, interp='linear')

plt.pcolor(zz)
#plt.contourf(xx,yy,zz) # if you want a contour plot
#plt.imshow(zz)
plt.colorbar()
plt.show()
Note: matplotlib.mlab.griddata has been removed in newer matplotlib releases; scipy.interpolate.griddata is the usual replacement.
My guess is that x, y and z will be read as one-dimensional vectors of the same length, let's say N. The problem is that when you create your xmesh and ymesh, they are N x N, which is what your z values should be as well; z is only length N, which is why you are getting an error.
What is the layout of your file? I'm guessing each row is a (x,y,z) that you want to create a mesh from. In order to do this, you need to know how the points are ordered as a mesh (either as row-major or column-major). Once you know this, instead of creating xmesh and ymesh, you can do something like this:
N = int(np.sqrt(len(x))) # only if the grid is square; adjust accordingly
x = x.reshape((N, N))
y = y.reshape((N, N))
z = z.reshape((N, N))
pcolormesh(x, y, z)
Before doing this, I would start by doing this:
scatter(x, y, c=z)
which will give you the points of the mesh, which is a good starting point.
I had the same problem and agree with Gustav Larsson's suggestion to use
scatter(x, y, c=z)
In my particular case, I set the linewidths of the scatter points to zero:
scatter(x, y, c=z, linewidths=0)
Of course, you can play around with other decorations, color schemes, etc.; the reference for matplotlib.pyplot.scatter will help you further.
It seems you are plotting X and Y as 2D arrays while Z is still a 1D array. Try something like:
Znew=np.reshape(z,(len(xmesh[:,0]),len(xmesh[0,:])))
diagram1.pcolormesh(xmesh,ymesh,Znew)
Update: You have an X/Y grid of size 4x4:
x = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (0))
y = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (1))
z = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', usecols = (2))
Reshape the arrays as suggested by @Gustav Larsson and myself like this:
Xnew = np.reshape(x, (4, 4))
Ynew = np.reshape(y, (4, 4))
Znew = np.reshape(z, (4, 4))
Which gives you three 4x4 arrays to plot using pcolormesh:
diagram1.pcolormesh(Xnew,Ynew,Znew)
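
Putting it together, a minimal end-to-end sketch for the 4x4 sample above (assuming the file is saved as mesh.txt and the rows are ordered row-major, as in the question):
import numpy as np
import matplotlib.pyplot as plt

# one genfromtxt call reads all three columns at once
x, y, z = np.genfromtxt('mesh.txt', dtype=float, delimiter=' ', unpack=True)

Xnew = x.reshape((4, 4))
Ynew = y.reshape((4, 4))
Znew = z.reshape((4, 4))

fig, diagram1 = plt.subplots()
mesh = diagram1.pcolormesh(Xnew, Ynew, Znew, shading='auto')  # shading='auto' needs matplotlib >= 3.3
fig.colorbar(mesh)
plt.show()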
