Why do I get a view error when enumerating a DataFrame - python-3.x

Why do I get a "view" error:
ndf = pd.DataFrame()
ndf['Signals'] = [1,1,1,1,1,0,0,0,0,0]
signals_diff = ndf.Signals.diff()
ndf['Revals'] = [101,102,105,104,105,106,107,108,109,109]
ndf['Entry'] = 0
for i,element in enumerate(signals_diff):
    if (i==0):
        ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']
    elif (element == 0):
        ndf.iloc[i]['Entry'] = ndf.iloc[i - 1]['Entry']
    else:
        ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ndf.iloc[i]['Entry'] = ndf.iloc[i]['Revals']

Instead of iloc, use loc:
ndf = pd.DataFrame()
ndf['Signals'] = [1,1,1,1,1,0,0,0,0,0]
signals_diff = ndf.Signals.diff()
ndf['Revals'] = [101,102,105,104,105,106,107,108,109,109]
ndf['Entry'] = 0
for i,element in enumerate(signals_diff):
    if (i==0):
        ndf.loc[i,'Entry'] = ndf.loc[i,'Revals']
    elif (element == 0):
        ndf.loc[i,'Entry'] = ndf.loc[i - 1,'Entry']
    else:
        ndf.loc[i,'Entry'] = ndf.loc[i,'Revals']
This solves the problem, but note that loc selects by index label, not by position. If your DataFrame does not have a default 0-based integer index, label-based assignment may not give the result you expect.
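For example (a minimal sketch with a hypothetical non-default index), loc selects by label while iloc selects by position, so the two can point at different rows:
import pandas as pd

# hypothetical frame whose index labels do not match positions
df = pd.DataFrame({'A': [10, 20]}, index=[5, 7])

df.iloc[0]['A']   # 10 -- first row by position
df.loc[5, 'A']    # 10 -- row labelled 5
# df.loc[0, 'A'] = 99 would not touch either existing row;
# it would create a brand-new row labelled 0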

Do not chain indexers like ndf.iloc[i]['Entry'] when trying to assign something; the warning above explains why that does not work reliably.
That said, your loop can be rewritten as:
ndf['Entry'] = ndf['Revals'].where(signals_diff != 0).ffill()
Output:
   Signals  Revals  Entry
0        1     101  101.0
1        1     102  101.0
2        1     105  101.0
3        1     104  101.0
4        1     105  101.0
5        0     106  106.0
6        0     107  106.0
7        0     108  106.0
8        0     109  106.0
9        0     109  106.0
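To see why the one-liner works, here is a minimal sketch over the same data: where keeps Revals only at the rows where the signal changed (including row 0, since the first diff is NaN and NaN != 0 evaluates True) and masks the rest, then ffill carries the last entry price forward:
import pandas as pd

ndf = pd.DataFrame({'Signals': [1,1,1,1,1,0,0,0,0,0],
                    'Revals':  [101,102,105,104,105,106,107,108,109,109]})
signals_diff = ndf.Signals.diff()   # NaN, 0, 0, 0, 0, -1, 0, 0, 0, 0

masked = ndf['Revals'].where(signals_diff != 0)
# 101.0, NaN, NaN, NaN, NaN, 106.0, NaN, NaN, NaN, NaN
ndf['Entry'] = masked.ffill()       # forward-fill the surviving prices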

If you want to keep using positional (iloc) indexing, you can slice the columns with get_indexer:
for i,element in enumerate(signals_diff):
    if (i==0):
        ndf.iloc[i,ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i,ndf.columns.get_indexer(['Revals'])]
    elif (element == 0):
        ndf.iloc[i,ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i - 1,ndf.columns.get_indexer(['Entry'])]
    else:
        ndf.iloc[i,ndf.columns.get_indexer(['Entry'])] = ndf.iloc[i,ndf.columns.get_indexer(['Revals'])]
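Since each lookup here targets a single column, columns.get_loc (which returns a plain integer position) is arguably a simpler fit; a sketch of the same loop:
entry_col = ndf.columns.get_loc('Entry')    # integer position of 'Entry'
revals_col = ndf.columns.get_loc('Revals')  # integer position of 'Revals'
for i, element in enumerate(signals_diff):
    if i == 0 or element != 0:
        ndf.iloc[i, entry_col] = ndf.iloc[i, revals_col]
    else:
        ndf.iloc[i, entry_col] = ndf.iloc[i - 1, entry_col]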

Related

Hackerrank - why is my output being written one character at a time?

I am solving the following "Vertical Sticks" HackerRank challenge: https://www.hackerrank.com/challenges/vertical-sticks/problem?isFullScreen=true&h_r=next-challenge&h_v=zen&h_r=next-challenge&h_v=zen
Here is my solution:
import itertools
import os

def solve(y):
    out = []
    x = list(itertools.permutations(y))
    for yp in x:
        arr = [1]
        for i in range(1, len(yp)):
            for j in range(i - 1, -1, -1):
                if yp[j] >= yp[i]:
                    arr.append(i - j)
                    break
                if j == 0:
                    arr.append(i + 1)
        out.append(sum(arr))
    p = round((sum(out) / len(out)), 2)
    pp = "%0.2f" % (p)
    print(pp)
    return pp

if __name__ == '__main__':
    fptr = open(os.environ['OUTPUT_PATH'], 'w')
    t = int(input().strip())
    for t_itr in range(t):
        y_count = int(input().strip())
        y = list(map(int, input().rstrip().split()))
        result = solve(y)
        fptr.write('\n'.join(map(str, result)))
        fptr.write('\n')
    fptr.close()
My print(pp) output comes out correctly for the test case as:
4.33
3.00
4.00
6.00
5.80
11.15
But my return pp stdout comes out as:
4
.
3
3
3
.
0
0
4
.
0
0
6
.
0
0
5
.
8
0
1
1
.
1
5
i.e. one character per line, and it is marked incorrect. Could somebody point me in the direction of why this is?
The return value from solve is already a string. When you call '\n'.join(map(str, result)) on it, you iterate over its individual characters and join them with newlines, which is why each character lands on its own line.
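A minimal fix, keeping the rest of the HackerRank boilerplate as-is, is to write the string directly instead of joining its characters:
result = solve(y)
fptr.write(result)   # result is already a formatted string like "4.33"
fptr.write('\n')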

Perform operations on a dataframe with map and reduce

I have a dataframe with the items sold by different stores every day:
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
I would like to get the results for each store (selected by shop_id) using filter, map and reduce.
So I tried:
each_shop = filter(lambda n: n==transactions.shop_id, transactions)
results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
for result in results:
    print(result)
But got:
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py:798: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = getattr(x, name)(y)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-21-4e86d47b9f0d> in <module>()
3 each_shop = filter(lambda n: n==transactions.shop_id, transactions)
4 results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
----> 5 for result in results:
6 print(result)
7
<ipython-input-21-4e86d47b9f0d> in <lambda>(n)
1 # YOUR CODE GOES HERE
2 # map
----> 3 each_shop = filter(lambda n: n==transactions.shop_id, transactions)
4 results = map(lambda n: transactions.item_price*transactions.item_cnt_day, each_shop)
5 for result in results:
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in wrapper(self, other, axis)
859
860 with np.errstate(all='ignore'):
--> 861 res = na_op(values, other)
862 if is_scalar(res):
863 raise TypeError('Could not compare %s type with Series' %
/opt/conda/lib/python3.6/site-packages/pandas/core/ops.py in na_op(x, y)
798 result = getattr(x, name)(y)
799 if result is NotImplemented:
--> 800 raise TypeError("invalid type comparison")
801 except AttributeError:
802 result = op(x, y)
TypeError: invalid type comparison
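A likely cause, assuming transactions is the DataFrame shown above: iterating over a DataFrame yields its column labels, so each n passed to filter is a string such as 'shop_id', and comparing that string with the shop_id Series is what raises the type-comparison error. A sketch that keeps the map/reduce style but iterates over the distinct shop ids instead:
from functools import reduce

shop_ids = transactions.shop_id.unique()
results = map(
    lambda shop: (transactions.loc[transactions.shop_id == shop, 'item_price']
                  * transactions.loc[transactions.shop_id == shop, 'item_cnt_day']).sum(),
    shop_ids,
)
total = reduce(lambda a, b: a + b, results)  # combined total across all shops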

Optimizing using Pandas Data Frame

I have the following function that loads a CSV into a data frame and then does some calculations. It takes about 4-5 minutes to run the calculations on a CSV with a little over 100,000 lines. I was hoping there is a faster way.
import datetime
import pandas as pd
from haversine import haversine

def calculate_adeck_errors(in_file):
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")}')
    pd.set_option('display.max_columns', 12)
    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    row_list = []
    for index, row in carq_data.iterrows():
        position_time = row['POSDATETIME']
        for index, arow in final_df.iterrows():
            if arow['POSDATETIME'] == position_time:
                # match, so do calculations
                storm_id = arow['STORMID']
                model_base_time = arow['MODELDATETIME']
                the_hour = arow['TAU']
                the_model = arow['MODEL']
                point1 = float(row['LAT']), float(row['LON'])
                point2 = float(arow['LAT']), float(arow['LON'])
                if arow['LAT'] == 0.0:
                    dist_error = None
                else:
                    dist_error = int(round(haversine(point1, point2, miles=True)))
                if arow['WIND'] != 0:
                    wind_error = int(abs(int(row['WIND']) - int(arow['WIND'])))
                else:
                    wind_error = None
                if arow['PRES'] != 0:
                    pressure_error = int(abs(int(row['PRES']) - int(arow['PRES'])))
                else:
                    pressure_error = None
                lat_carq = row['LAT']
                lon_carq = row['LON']
                lat_model = arow['LAT']
                lon_model = arow['LON']
                wind_carq = row['WIND']
                wind_model = arow['WIND']
                pres_carq = row['PRES']
                pres_model = arow['PRES']
                row_list.append([storm_id, model_base_time, the_model, the_hour,
                                 lat_carq, lon_carq, lat_model, lon_model, dist_error,
                                 wind_carq, wind_model, wind_error,
                                 pres_carq, pres_model, pressure_error])
    result_df = pd.DataFrame(row_list)
    result_df = result_df.where((pd.notnull(result_df)), None)
    result_cols = ['StormID', 'ModelBasetime', 'Model', 'Tau',
                   'LatCARQ', 'LonCARQ', 'LatModel', 'LonModel', 'DistError',
                   'WindCARQ', 'WindModel', 'WindError',
                   'PresCARQ', 'PresModel', 'PresError']
    result_df.columns = result_cols

calculate_adeck_errors(infile)
To clarify what I'm doing:
1. The CARQ entries are the control (actual).
2. The other models are the guesses.
3. I'm comparing the control (CARQ) to the guesses to see what their errors are.
4. The basis of the comparison is MODELBASETIME = POSBASETIME.
5. A sample file I'm processing is here: http://vortexweather.com/downloads/adeck/aal062018.csv
I was hoping there is a faster way than I'm doing it, or another pandas method besides iterrows.
Many thanks for any suggestions.
Bryan
This code takes about 10 seconds to run on your entire dataset!
The code looks very similar to what you have written, with the exception that all of the operations within the main_function have been vectorized. See Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects
2018-09-13_adeck_error_calculations.ipynb
import pandas as pd
import numpy as np
import datetime
from haversine import haversine

def main_function(df, row):
    """
    The main difference here is that everything is vectorized
    Returns: DataFrame
    """
    df_new = pd.DataFrame()
    df_storage = pd.DataFrame()
    pos_datetime = df.POSDATETIME.isin([row['POSDATETIME']])  # creates a Boolean map
    array_len = len(pos_datetime)
    new_index = pos_datetime.index
    df_new['StormID'] = df.loc[pos_datetime, 'STORMID']
    df_new['ModelBaseTime'] = df.loc[pos_datetime, 'MODELDATETIME']
    df_new['Model'] = df.loc[pos_datetime, 'MODEL']
    df_new['Tau'] = df.loc[pos_datetime, 'TAU']
    # Distance
    df_new['LatCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LAT']), index=new_index).loc[pos_datetime, 0]
    df_new['LonCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LON']), index=new_index).loc[pos_datetime, 0]
    df_new['LatModel'] = df.loc[pos_datetime, 'LAT']
    df_new['LonModel'] = df.loc[pos_datetime, 'LON']

    def calc_dist_error(row):
        return round(haversine((row['LatCARQ'], row['LonCARQ']), (row['LatModel'], row['LonModel']), miles=True)) if row['LatModel'] != 0.0 else None

    df_new['DistError'] = df_new.apply(calc_dist_error, axis=1)
    # Wind
    df_new['WindCARQ'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_new['WindModel'] = df.loc[pos_datetime, 'WIND']
    df_storage['row_WIND'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_WIND'] = df.loc[pos_datetime, 'WIND']

    def wind_error_calc(row):
        return (row['row_WIND'] - row['df_WIND']) if row['df_WIND'] != 0 else None

    df_new['WindError'] = df_storage.apply(wind_error_calc, axis=1)
    # Air Pressure
    df_new['PresCARQ'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_new['PresModel'] = df.loc[pos_datetime, 'PRES']
    df_storage['row_PRES'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_PRES'] = df.loc[pos_datetime, 'PRES']

    def pres_error_calc(row):
        return abs(row['row_PRES'] - row['df_PRES']) if row['df_PRES'] != 0 else None

    df_new['PresError'] = df_storage.apply(pres_error_calc, axis=1)
    del df_storage
    return df_new

def calculate_adeck_errors(in_file):
    """
    Returns: DataFrame
    """
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    pd.set_option('max_columns', 20)
    pd.set_option('max_rows', 300)
    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    adeck_df['MODELDATETIME'] = pd.to_datetime(adeck_df['MODELDATETIME'], format='%Y-%m-%d %H:%M')
    adeck_df['POSDATETIME'] = pd.to_datetime(adeck_df['POSDATETIME'], format='%Y-%m-%d %H:%M')
    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    print('Len carq_data: ', len(carq_data))
    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    print('Len final_df: ', len(final_df))
    df_out_new = pd.DataFrame()
    for index, row in carq_data.iterrows():
        test_df = main_function(final_df, row)  # function call
        df_out_new = df_out_new.append(test_df, sort=False)
    df_out_new = df_out_new.reset_index(drop=True)
    df_out_new = df_out_new.where((pd.notnull(df_out_new)), None)
    print(f'Finishing Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    return df_out_new

in_file = 'aal062018.csv'
df = calculate_adeck_errors(in_file)
>>>Starting Data Calculations: 02:18:30AM on September 13, 2018
>>>Len carq_data: 56
>>>Len final_df: 137999
>>>Finishing Data Calculations: 02:18:39AM on September 13, 2018
print(len(df))
>>>95630
print(df.head(20))
Please don't forget to check the accepted solution. Enjoy!
It looks like you are creating two dataframes out of the same dataframe and then processing them. Here are two things that may cut your run time.
First, you are iterating over both dataframes and checking for a condition:
for _, row in carq_data.iterrows():
    for _, arow in final_df.iterrows():
        if arow['POSDATETIME'] == row['POSDATETIME']:
            # do something by using both tables
This is essentially an implementation of a join. You are joining carq_data with final_df on 'POSDATETIME'.
As a first step, you should merge the tables:
merged = carq_data.merge(final_df, on=['POSDATETIME'])
At this point you will get multiple rows for each matching 'POSDATETIME'. In the toy example below, column a plays the role of POSDATETIME (the join key):
>>> a
   a   b
0  1  11
1  1  33
>>> b
   a  b
0  1  2
1  1  3
2  1  4
>>> merged = a.merge(b, on=['a'])
>>> merged
   a  b_x  b_y
0  1   11    2
1  1   11    3
2  1   11    4
3  1   33    2
4  1   33    3
5  1   33    4
Now, to do your conditional calculations, you can use the apply() function.
First, define a function:
def calc_dist_error(row):
    return int(round(haversine(row['b_x'], row['b_y'], miles=True))) if row['a'] != 0.0 else None
Then apply it to every row:
merged['dist_error'] = merged.apply(calc_dist_error, axis=1)
Continuing my small example:
>>> merged['c'] = [1, 0, 0, 0, 2, 3]
>>> merged
   a  b_x  b_y  c
0  1   11    2  1
1  1   11    3  0
2  1   11    4  0
3  1   33    2  0
4  1   33    3  2
5  1   33    4  3
>>> def foo(row):
...     return row['b_x'] - row['b_y'] if row['c'] != 0 else None
...
>>> merged['dist_error'] = merged.apply(foo, axis=1)
>>> merged
   a  b_x  b_y  c  dist_error
0  1   11    2  1         9.0
1  1   11    3  0         NaN
2  1   11    4  0         NaN
3  1   33    2  0         NaN
4  1   33    3  2        30.0
5  1   33    4  3        29.0
This should help you reduce run time (you can measure the improvement with %timeit). Hope this helps!
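A minimal sketch of such a measurement in an IPython/Jupyter session, reusing merged and foo from above:
%timeit merged.apply(foo, axis=1)        # row-wise apply
%timeit merged['b_x'] - merged['b_y']    # plain vectorized subtraction, for comparison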

TypeError: ("Cannot compare type 'Timestamp' with type 'str'", 'occurred at index 262224')

I am trying to create a flag from a datetime column, but I get an error after applying the function below.
def f(r):
    if r['balance_dt'] <= '2016-11-30':
        return 0
    else:
        return 1

df_obctohdfc['balance_dt_flag'] = df_obctohdfc.apply(f,axis=1)
The error you are getting is because you are comparing a string object to a datetime object. You can convert the string to a datetime.
Ex:
import datetime

def f(r):
    if r['balance_dt'] <= datetime.datetime.strptime('2016-11-30', '%Y-%m-%d'):
        return 0
    else:
        return 1

df_obctohdfc['balance_dt_flag'] = df_obctohdfc.apply(f,axis=1)
Note: it is better to do it the way jezrael mentions below; that is the right way to do it.
In pandas it is best to avoid loops; apply is itself just a loop under the hood.
I think you need to convert the string to a datetime and then cast the boolean mask to integer - True to 1 and False to 0 - changing <= to >:
timestamp = pd.to_datetime('2016-11-30')
df_obctohdfc['balance_dt_flag'] = (df_obctohdfc['balance_dt'] > timestamp).astype(int)
Sample:
rng = pd.date_range('2016-11-27', periods=10)
df_obctohdfc = pd.DataFrame({'balance_dt': rng})
#print (df_obctohdfc)
timestamp = pd.to_datetime('2016-11-30')
df_obctohdfc['balance_dt_flag'] = (df_obctohdfc['balance_dt'] > timestamp).astype(int)
print (df_obctohdfc)
  balance_dt  balance_dt_flag
0 2016-11-27                0
1 2016-11-28                0
2 2016-11-29                0
3 2016-11-30                0
4 2016-12-01                1
5 2016-12-02                1
6 2016-12-03                1
7 2016-12-04                1
8 2016-12-05                1
9 2016-12-06                1
Comparing in 1000 rows DataFrame:
In [140]: %timeit df_obctohdfc['balance_dt_flag1'] = (df_obctohdfc['balance_dt'] > timestamp).astype(int)
1000 loops, best of 3: 368 µs per loop
In [141]: %timeit df_obctohdfc['balance_dt_flag2'] = df_obctohdfc.apply(f,axis=1)
10 loops, best of 3: 91.2 ms per loop
Setup:
rng = pd.date_range('2015-11-01', periods=1000)
df_obctohdfc = pd.DataFrame({'balance_dt': rng})
timestamp = pd.to_datetime('2016-11-30')

import datetime

def f(r):
    if r['balance_dt'] <= datetime.datetime.strptime('2016-11-30', '%Y-%m-%d'):
        return 0
    else:
        return 1

Creating objects with attributes from a text file

I'm making a program that reads the info of some football teams from a text file (named 30eapril.txt) and uses this data when creating some team objects. I wonder how I can make the program read the number of teams in the text file and create objects for them. The code I've written so far works but has a lot of repetitive parts!
class team:
    def __init__(self, teamdata):
        self.name = teamdata[0]
        self.wins = teamdata[1]
        self.drawn = teamdata[2]
        self.losses = teamdata[3]

    def __repr__(self):
        return self.name.ljust(15) + '{} {} {}'.format(self.wins, self.drawn, self.losses)

laglista = []
with open('30eapril.txt', 'rt') as file:
    for line in file:
        laglista.append(line)

team1data = (laglista[0]).split()
team2data = (laglista[1]).split()
team3data = (laglista[2]).split()
team4data = (laglista[3]).split()

lag1 = team(team1data)
lag2 = team(team2data)
lag3 = team(team3data)
lag4 = team(team4data)

print(lag1)
print(lag2)
print(lag3)
print(lag4)
This is what was in the text file:
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
Hope that someone can help!
//Peter
Shortened code (could certainly be even better):
#!/usr/bin/env python3

class team:
    def __init__(self, teamdata):
        self.name, self.wins, self.drawn, self.losses = teamdata

    def __repr__(self):
        return self.name.ljust(15) + '{} {} {}'.format(self.wins, self.drawn, self.losses)

lag = []
with open('30eapril.txt', 'rt') as file:
    for line in file:
        lag.append(team(line.split()))

#print("Number of teams: " + str(len(lag)))
for l in lag:
    print(l)
You don't need to know the number of lines of your file.
With the same content of '30eapril.txt', the output is:
$ ./test_script3.py
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
The same script with an extra line in '30eapril.txt':
$ ./test_script3.py
Arsenal 2 1 0
Manchester 2 0 0
Liverpool 0 1 2
Newcastle 0 0 2
AnotherClub 1 0 2
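One further, purely optional refinement (a hypothetical sketch, not from the original post): the counts are stored as strings above, so converting them to int in __init__ makes the objects easier to sort or total later:
class team:
    def __init__(self, teamdata):
        self.name = teamdata[0]
        # store the counts as integers rather than strings
        self.wins, self.drawn, self.losses = (int(n) for n in teamdata[1:4])

    def __repr__(self):
        return self.name.ljust(15) + '{} {} {}'.format(self.wins, self.drawn, self.losses)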
