PyTorch: DataLoader with shuffle=False producing the same batches

class DataSet(torch.utils.data.Dataset):
    def __init__(self, dataframe, n_classes=3, w=384, h=384, apply_aug=False):
        self.data = dataframe
        self.n_classes = n_classes
        self.apply_aug = apply_aug
        self.w = w
        self.h = h
        self.transform = A.Compose([A.Rotate(limit=30, p=0.8),
                                    A.HorizontalFlip(),
                                    A.CoarseDropout(max_height=0.1, max_width=0.1, p=1.0),
                                    A.ShiftScaleRotate(shift_limit=0.09, scale_limit=0.2, rotate_limit=0)
                                    ])

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        IMG = np.zeros((1, self.w, self.h), dtype=np.float32)
        MASK = np.zeros((self.n_classes, self.w, self.h), dtype=np.float32)
        path = self.data.iloc[idx]["image_path"]
        encoded_list = self.data.iloc[idx]["segmentation"]
        width = int(self.data.iloc[idx]["width"][0])
        heigth = int(self.data.iloc[idx]["heigth"][0])
        class_list = self.data.iloc[idx]["class"]
        img = implt.imread(path)
        print(f"idx is {idx}")
        IMG[:, :width, :heigth] = img
        for class_idx, c in enumerate(class_list):
            if str(encoded_list[class_idx]) == "nan":
                mask = np.zeros((1, width, heigth))
            else:
                mask = RLE_TO_MASK(encoded_list[class_idx], width, heigth)
            MASK[c, :width, :heigth] = mask[0]
        if self.apply_aug:
            transformed = self.transform(image=IMG, mask=MASK)
            IMG, MASK = transformed['image'], transformed['mask']
        IMG = IMG / 255.0
        return IMG, MASK
Above is the Dataset class I created; it returns the images and masks.
With shuffle=True the DataLoader works fine, but with shuffle=False it keeps returning the same batch it already produced instead of moving on to the next one.
dataloader = torch.utils.data.DataLoader(DataSet(df, apply_aug=True), batch_size=BATCH_SIZE, shuffle=False)

for i in range(2):
    images, masks = next(iter(dataloader))
    print()
    print(images.shape, masks.shape)
idx is 0
idx is 1
idx is 2
idx is 3
idx is 4
idx is 5
idx is 6
idx is 7
idx is 8
idx is 9
idx is 10
idx is 11
idx is 12
idx is 13
idx is 14
idx is 15
idx is 16
idx is 17
idx is 18
idx is 19
idx is 20
idx is 21
idx is 22
idx is 23
idx is 24
idx is 25
idx is 26
idx is 27
idx is 28
idx is 29
idx is 30
idx is 31
idx is 0
idx is 1
idx is 2
idx is 3
idx is 4
idx is 5
idx is 6
idx is 7
idx is 8
idx is 9
idx is 10
idx is 11
idx is 12
idx is 13
idx is 14
idx is 15
idx is 16
idx is 17
idx is 18
idx is 19
idx is 20
idx is 21
idx is 22
idx is 23
idx is 24
idx is 25
idx is 26
idx is 27
idx is 28
idx is 29
idx is 30
idx is 31
torch.Size([32, 1, 384, 384]) torch.Size([32, 3, 384, 384])
When shuffle=True
for i in range(2):
    images, masks = next(iter(dataloader))
    print()
    print(images.shape, masks.shape)
idx is 25498
idx is 15357
idx is 11275
idx is 36247
idx is 33223
idx is 8566
idx is 14229
idx is 23999
idx is 28883
idx is 8847
idx is 35485
idx is 36647
idx is 22422
idx is 3693
idx is 32525
idx is 19464
idx is 22187
idx is 38244
idx is 7795
idx is 3690
idx is 7461
idx is 36806
idx is 22455
idx is 6817
idx is 8789
idx is 37809
idx is 33157
idx is 22828
idx is 35858
idx is 38320
idx is 2684
idx is 29708
idx is 38240
idx is 28020
idx is 10356
idx is 20215
idx is 18561
idx is 30083
idx is 30997
idx is 14020
idx is 20896
idx is 25551
idx is 2735
idx is 19138
idx is 23026
idx is 30677
idx is 26664
idx is 2731
idx is 14150
idx is 16735
idx is 28621
idx is 18268
idx is 11793
idx is 35654
idx is 4470
idx is 11312
idx is 37349
idx is 27501
idx is 5389
idx is 34019
idx is 24120
idx is 38311
idx is 14880
idx is 9533
torch.Size([32, 1, 384, 384]) torch.Size([32, 3, 384, 384])

You are using the iterator incorrectly:
next(iter(dataloader))
On every step you create a brand-new iterator and take its first element, so with shuffle=False you always get the first batch again (with shuffle=True you simply get a freshly shuffled first batch, which is why it appeared to work). Instead, create the iterator once before the for loop and call next() on it at every step.
But why not simply iterate over your dataloader this way:

for images, masks in dataloader:
    # do something with the data
    ...

If you want to use iter(), don't create the iterator every time:

gen = iter(dataloader)
for i in range(len(dataloader)):
    data = next(gen)
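For instance, applying that pattern to the snippet from the question (DataSet, df and BATCH_SIZE as defined there) yields a different batch on every step even with shuffle=False:

dataloader = torch.utils.data.DataLoader(DataSet(df, apply_aug=True), batch_size=BATCH_SIZE, shuffle=False)

# Create the iterator once; each next() call then advances to the following batch
# instead of restarting from batch 0.
gen = iter(dataloader)
for i in range(2):
    images, masks = next(gen)   # batch 0 on the first pass, batch 1 on the second
    print(images.shape, masks.shape)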

Related

Received a KeyError: 1 exception from networkx's __getitem__(self, key) while trying to read the graph data

def inflow(G_inflow, sink, neigh_nodes):
    print(f'G_inflow: {list(G_inflow)}')
    i = 0
    j = 1
    z = 0
    print(f'neighbour node value: {neigh_nodes}')
    cnt = len(neigh_nodes)
    print(f'type of neigh nodes: {type(sink)}')
    while j <= cnt:
        val = G_inflow[sink][j]['flow']
        j += 1
    return z
The edge attribute structure is {'capacity': 8545, 'flow': 4869}. When trying to access G_inflow[sink][j]['flow'] I get a KeyError from __getitem__(self, key). Is there another way to resolve this issue?
<ipython-input-46-66e02fb8ffba> in inflow(G_inflow, sink, neigh_nodes)
42 cnt = operator.length_hint(neigh_nodes)
43 while j <= cnt :
---> 44 print(f'Sink data {sink}-{j}: {G_inflow[sink][j]}')
45 z += G_inflow[sink][j]['flow']
46 j+=1
/usr/local/lib/python3.7/dist-packages/networkx/classes/coreviews.py in __getitem__(self, key)
52
53 def __getitem__(self, key):
---> 54 return self._atlas[key]
55
56 def copy(self):
KeyError: 1
I am trying to read the 'flow' value of a graph edge attribute, and this access is throwing the exception.
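One likely cause is that j is a running position counter (1, 2, ...), while G_inflow[sink] is keyed by the neighbour node labels themselves. Below is a minimal sketch of reading the 'flow' attribute by iterating the adjacency view directly; the graph is hypothetical and only mirrors the edge-attribute structure quoted above:

import networkx as nx

# Hypothetical graph with the same edge-attribute structure as in the question.
G_inflow = nx.Graph()
G_inflow.add_edge('s', 'a', capacity=8545, flow=4869)
G_inflow.add_edge('s', 'b', capacity=4000, flow=1200)

sink = 's'

# G_inflow[sink] maps neighbour *labels* to their edge-attribute dicts,
# so index it with the neighbour node rather than an integer counter.
total_flow = 0
for neighbour, attrs in G_inflow[sink].items():
    total_flow += attrs['flow']
print(total_flow)  # 6069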

Cannot cast ufunc subtract output from dtype('float64') to dtype('int64') with casting rule 'same_kind' despite forced conversion

I have a data Series ts:
0 2599.0
1 2599.0
2 3998.0
3 3998.0
4 1299.0
5 1499.0
6 1499.0
7 2997.5
8 749.5
Name: 0, dtype: float64
and I would like to predict the next period using ARIMA:
import statsmodels.tsa.api as smt

array = []
for i, row in test.iterrows():
    print("row['shop_id']: ", row['shop_id'], " row['item_id']: ", row['item_id'])
    ts = pd.DataFrame(sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_price'].values
                      * sales_monthly.loc[pd.IndexSlice[:, [row['shop_id']], [row['item_id']]], :]['item_cnt_day'].values).T.iloc[0]
    rng = range(5)
    for i in rng:
        for j in rng:
            try:
                tmp_mdl = smt.ARMA(ts, order=(i, j)).fit(method='mle', trand='nc')
                tmp_aic = tmp_mdl.aic
                if tmp_aic < best_aic:
                    best_aic = tmp_aic
                    best_order = (i, j)
                    best_mdl = tmp_mdl
            except:
                continue
    if best_mdl.predict() < 0:
        y_pred = 0
    else:
        y_pred = best_mdl.predict()
    d = {'id': row['ID'], 'item_cnt_month': y_pred}
    array.append(d)
df = pd.DataFrame(array)
df
But I get:
---------------------------------------------------------------------------
UFuncTypeError Traceback (most recent call last)
<ipython-input-104-85dfa2fa67c1> in <module>()
22 except:
23 continue
---> 24 if best_mdl.predict()<0:
25 y_pred = 0
26 else:
3 frames
/usr/local/lib/python3.6/dist-packages/statsmodels/tsa/arima_model.py in geterrors(self, params)
686 k = self.k_exog + self.k_trend
687 if k > 0:
--> 688 y -= dot(self.exog, params[:k])
689
690 k_ar = self.k_ar
UFuncTypeError: Cannot cast ufunc 'subtract' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'
So I used best_mdl.predict().astype('float32'), but it didn't change anything.
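The traceback shows the failing subtraction happens inside statsmodels on the endogenous data itself, so casting the prediction afterwards comes too late. Here is a hedged sketch of the usual workaround, casting the series to float64 before fitting; smt.ARMA is the estimator from the old statsmodels 0.x line shown in the traceback, and the series below is only a stand-in for the question's ts:

import pandas as pd
import statsmodels.tsa.api as smt

# Stand-in series with an integer dtype, which is what triggers the casting error.
ts = pd.Series([2599, 2599, 3998, 3998, 1299, 1499, 1499, 2997, 749])

# Cast the endogenous data to float64 *before* building the model.
ts = ts.astype('float64')

mdl = smt.ARMA(ts, order=(1, 0)).fit(method='mle', trend='nc')
print(mdl.predict())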

Printing each stage of Bubble Sort in Python 3.6

Sort the given set of numbers using Bubble Sort. The first line of the input contains the number of elements, and the remaining lines contain the numbers to be sorted. In the output, print the status of the array at the 3rd iteration and the final sorted array in the given format.
alist = []

def bubble_sort(alist):
    for i in range(len(alist) - 1, 0, -1):
        no_swap = True
        for j in range(0, i):
            if alist[j + 1] < alist[j]:
                alist[j], alist[j + 1] = alist[j + 1], alist[j]
                no_swap = False
        if no_swap:
            return

n = int(input())
for i in range(n):
    alist.append(int(input()))
alist = [int(x) for x in alist]

bubble_sort(alist)
print('Sorted array: ', end='\n')
for i in alist:
    print(i, end=" ")
Test Case 1
7
64
34
25
12
22
11
90
Expected Output:
It should print the following 3 lines
12 22 11 25 34 64 90
Sorted array:
11 12 22 25 34 64 90
Test Case 2
8
14
83
25
47
9
77
1
0
Expected Output:
It should print the 3 following lines
14 9 25 1 0 47 77 83
Sorted array:
0 1 9 14 25 47 77 83
Just add a print inside your for loop when you reach the third iteration:
def bubble_sort(alist):
    for i in range(len(alist) - 1, 0, -1):
        no_swap = True
        for j in range(0, i):
            if alist[j + 1] < alist[j]:
                alist[j], alist[j + 1] = alist[j + 1], alist[j]
                no_swap = False
        if i == len(alist) - 3:
            # i counts down from len(alist) - 1, so this is reached at the end of the 3rd pass.
            print(*alist)  # Unpacking for pretty printing; in Python 2 use " ".join(map(str, alist))
        if no_swap:
            return

alist = [64, 34, 25, 12, 22, 11, 90]  # Test Case 1
bubble_sort(alist)
print('Sorted array: ', end='\n')
for i in alist:
    print(i, end=" ")
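If the input has to be read exactly as the assignment describes (the count first, then one number per line), here is a minimal sketch of the same idea with an explicit pass counter; the behaviour is otherwise identical to the answer above:

def bubble_sort(alist):
    passes = 0
    for i in range(len(alist) - 1, 0, -1):
        no_swap = True
        for j in range(i):
            if alist[j + 1] < alist[j]:
                alist[j], alist[j + 1] = alist[j + 1], alist[j]
                no_swap = False
        passes += 1
        if passes == 3:        # status of the array after the 3rd pass
            print(*alist)
        if no_swap:
            return

n = int(input())
alist = [int(input()) for _ in range(n)]
bubble_sort(alist)
print('Sorted array: ')
print(*alist)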

Python find indexes of list of lists

I'm trying to find the indexes of all the elements in a python list of lists. The list would look something like this:
list = [[x, y, x],
        [x, y, y],
        [y, y, y]]
To do this I have tried to use a nested loop like this:
for lst in list:
    print(list.index(lst))
    for x in lst:
        print(lst.index(x))
This, however, prints out a long list of seemingly random numbers.
[screenshot of output for a slightly different list]
What I'm trying to achieve is output looking something like this:
0
1
0
2
0
3
1
0
1
0
Is there anyone who can help out a python beginner?
You can use enumerate to print the indexes:
x = 22
y = 34
l = [[x, y, x],
     [x, y, y],
     [y, y, y]]

indexes = [(i, j) for i, nl in enumerate(l) for j, nle in enumerate(nl)]
print(*indexes, sep="\n")
# output
(0, 0)
(0, 1)
(0, 2)
(1, 0)
(1, 1)
(1, 2)
(2, 0)
(2, 1)
(2, 2)
https://docs.python.org/3/library/functions.html#enumerate
You can easily access both index and value by looping through the list with enumerate. I added some text to the printing to show which number is which.
myList = [[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]]

for lstIdx, lst in enumerate(myList):
    print("list idx: " + str(lstIdx))
    for idx, item in enumerate(lst):
        print("item idx in list: " + str(idx))
output:
list idx: 0
item idx in list: 0
item idx in list: 1
item idx in list: 2
list idx: 1
item idx in list: 0
item idx in list: 1
item idx in list: 2
list idx: 2
item idx in list: 0
item idx in list: 1
item idx in list: 2

Optimizing using Pandas Data Frame

I have the following function that loads a CSV into a data frame and then does some calculations. It takes about 4-5 minutes to run on a CSV with a little over 100,000 lines. I was hoping there is a faster way.
def calculate_adeck_errors(in_file):
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")}')
    pd.set_option('display.max_columns', 12)

    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    #print(adeck_df)

    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    #print(carq_data)

    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    #print(final_df)

    row_list = []
    for index, row in carq_data.iterrows():
        position_time = row['POSDATETIME']
        for index, arow in final_df.iterrows():
            if arow['POSDATETIME'] == position_time:
                # match, so do calculations
                storm_id = arow['STORMID']
                model_base_time = arow['MODELDATETIME']
                the_hour = arow['TAU']
                the_model = arow['MODEL']
                point1 = float(row['LAT']), float(row['LON'])
                point2 = float(arow['LAT']), float(arow['LON'])
                if arow['LAT'] == 0.0:
                    dist_error = None
                else:
                    dist_error = int(round(haversine(point1, point2, miles=True)))
                if arow['WIND'] != 0:
                    wind_error = int(abs(int(row['WIND']) - int(arow['WIND'])))
                else:
                    wind_error = None
                if arow['PRES'] != 0:
                    pressure_error = int(abs(int(row['PRES']) - int(arow['PRES'])))
                else:
                    pressure_error = None
                lat_carq = row['LAT']
                lon_carq = row['LON']
                lat_model = arow['LAT']
                lon_model = arow['LON']
                wind_carq = row['WIND']
                wind_model = arow['WIND']
                pres_carq = row['PRES']
                pres_model = arow['PRES']
                row_list.append([storm_id, model_base_time, the_model, the_hour, lat_carq, lon_carq, lat_model, lon_model, dist_error,
                                 wind_carq, wind_model, wind_error, pres_carq, pres_model, pressure_error])

    result_df = pd.DataFrame(row_list)
    result_df = result_df.where((pd.notnull(result_df)), None)
    result_cols = ['StormID', 'ModelBasetime', 'Model', 'Tau',
                   'LatCARQ', 'LonCARQ', 'LatModel', 'LonModel', 'DistError',
                   'WindCARQ', 'WindModel', 'WindError',
                   'PresCARQ', 'PresModel', 'PresError']
    result_df.columns = result_cols

calculate_adeck_errors(infile)
To clarify what I'm doing:
1. The CARQ entries are the control (actual).
2. The other models are the guesses.
3. I'm comparing the control (CARQ) to the guesses to see what their errors are.
4. The basis of the comparison is MODELBASETIME = POSBASETIME.
5. A sample file I'm processing is here: http://vortexweather.com/downloads/adeck/aal062018.csv
I was hoping there is a faster way than what I'm doing, or another pandas method besides iterrows.
Many thanks for any suggestions.
Bryan
This code takes about 10 seconds to run on your entire dataset!
The code looks very similar to what you have written, except that all of the operations within main_function have been vectorized. See Fast, Flexible, Easy and Intuitive: How to Speed Up Your Pandas Projects.
2018-09-13_adeck_error_calculations.ipynb
import pandas as pd
import numpy as np
import datetime
from haversine import haversine


def main_function(df, row):
    """
    The main difference here is that everything is vectorized
    Returns: DataFrame
    """
    df_new = pd.DataFrame()
    df_storage = pd.DataFrame()

    pos_datetime = df.POSDATETIME.isin([row['POSDATETIME']])  # creates a Boolean map
    array_len = len(pos_datetime)
    new_index = pos_datetime.index

    df_new['StormID'] = df.loc[pos_datetime, 'STORMID']
    df_new['ModelBaseTime'] = df.loc[pos_datetime, 'MODELDATETIME']
    df_new['Model'] = df.loc[pos_datetime, 'MODEL']
    df_new['Tau'] = df.loc[pos_datetime, 'TAU']

    # Distance
    df_new['LatCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LAT']), index=new_index).loc[pos_datetime, 0]
    df_new['LonCARQ'] = pd.DataFrame(np.full((array_len, 1), row['LON']), index=new_index).loc[pos_datetime, 0]
    df_new['LatModel'] = df.loc[pos_datetime, 'LAT']
    df_new['LonModel'] = df.loc[pos_datetime, 'LON']

    def calc_dist_error(row):
        return round(haversine((row['LatCARQ'], row['LonCARQ']), (row['LatModel'], row['LonModel']), miles=True)) if row['LatModel'] != 0.0 else None

    df_new['DistError'] = df_new.apply(calc_dist_error, axis=1)

    # Wind
    df_new['WindCARQ'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_new['WindModel'] = df.loc[pos_datetime, 'WIND']
    df_storage['row_WIND'] = pd.DataFrame(np.full((array_len, 1), row['WIND']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_WIND'] = df.loc[pos_datetime, 'WIND']

    def wind_error_calc(row):
        return (row['row_WIND'] - row['df_WIND']) if row['df_WIND'] != 0 else None

    df_new['WindError'] = df_storage.apply(wind_error_calc, axis=1)

    # Air Pressure
    df_new['PresCARQ'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_new['PresModel'] = df.loc[pos_datetime, 'PRES']
    df_storage['row_PRES'] = pd.DataFrame(np.full((array_len, 1), row['PRES']), index=new_index).loc[pos_datetime, 0]
    df_storage['df_PRES'] = df.loc[pos_datetime, 'PRES']

    def pres_error_calc(row):
        return abs(row['row_PRES'] - row['df_PRES']) if row['df_PRES'] != 0 else None

    df_new['PresError'] = df_storage.apply(pres_error_calc, axis=1)
    del(df_storage)

    return df_new


def calculate_adeck_errors(in_file):
    """
    Returns: DataFrame
    """
    print(f'Starting Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')
    pd.set_option('max_columns', 20)
    pd.set_option('max_rows', 300)

    # read in the raw csv
    adeck_df = pd.read_csv(in_file)
    adeck_df['MODELDATETIME'] = pd.to_datetime(adeck_df['MODELDATETIME'], format='%Y-%m-%d %H:%M')
    adeck_df['POSDATETIME'] = pd.to_datetime(adeck_df['POSDATETIME'], format='%Y-%m-%d %H:%M')

    # extract only the carq items and remove duplicates
    carq_data = adeck_df[(adeck_df.MODEL == 'CARQ') & (adeck_df.TAU == 0)].drop_duplicates(keep='last')
    print('Len carq_data: ', len(carq_data))

    # remove carq items from original
    final_df = adeck_df[adeck_df.MODEL != 'CARQ']
    print('Len final_df: ', len(final_df))

    df_out_new = pd.DataFrame()
    for index, row in carq_data.iterrows():
        test_df = main_function(final_df, row)  # function call
        df_out_new = df_out_new.append(test_df, sort=False)

    df_out_new = df_out_new.reset_index(drop=True)
    df_out_new = df_out_new.where((pd.notnull(df_out_new)), None)
    print(f'Finishing Data Calculations: {datetime.datetime.now().strftime("%I:%M:%S%p on %B %d, %Y")}')

    return df_out_new


in_file = 'aal062018.csv'
df = calculate_adeck_errors(in_file)
>>>Starting Data Calculations: 02:18:30AM on September 13, 2018
>>>Len carq_data: 56
>>>Len final_df: 137999
>>>Finishing Data Calculations: 02:18:39AM on September 13, 2018
print(len(df))
>>>95630
print(df.head(20))
Please don't forget to check the accepted solution. Enjoy!
It looks like you are creating two dataframes out of the same dataframe and then processing them. Here are two things that may cut your run time.
First, you are iterating over both dataframes and checking for a condition:
for _, row in carq_data.iterrows():
    for _, arow in final_df.iterrows():
        if arow['POSDATETIME'] == row['POSDATETIME']:
            # do something by using both tables
This is essentially an implementation of a join. You are joining carq_data with final_df on 'POSDATETIME'.
As a first step, you should merge the tables:
merged = carq_data.merge(final_df, on=['POSDATETIME'])
At this point you will get multiple rows for each matching 'POSDATETIME'. In the toy example below, column a plays the role of POSDATETIME:
>>> a
a b
0 1 11
1 1 33
>>> b
a b
0 1 2
1 1 3
2 1 4
>>> merged = a.merge(b, on=['a'])
>>> merged
a b_x b_y
0 1 11 2
1 1 11 3
2 1 11 4
3 1 33 2
4 1 33 3
5 1 33 4
Now, to do your conditional calculations, you can use the apply() function.
First, define a function:
def calc_dist_error(row):
    return int(round(haversine(row['b_x'], row['b_y'], miles=True))) if row['a'] != 0.0 else None
Then apply it to every row:
merged['dist_error'] = merged.apply(calc_dist_error, axis=1)
Continuing my small example:
>>> merged['c'] = [1, 0, 0, 0, 2, 3]
>>> merged
a b_x b_y c
0 1 11 2 1
1 1 11 3 0
2 1 11 4 0
3 1 33 2 0
4 1 33 3 2
5 1 33 4 3
>>> def foo(row):
... return row['b_x'] - row['b_y'] if row['c'] != 0 else None
...
>>> merged['dist_error'] = merged.apply(foo, axis=1)
>>> merged
a b_x b_y c dist_error
0 1 11 2 1 9.0
1 1 11 3 0 NaN
2 1 11 4 0 NaN
3 1 33 2 0 NaN
4 1 33 3 2 30.0
5 1 33 4 3 29.0
This should help you reduce the run time (you can check the improvement with %timeit). Hope this helps!
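Putting both ideas together, and assuming carq_data and final_df are built exactly as in the question (so the shared LAT, LON, WIND and PRES columns pick up the suffixes passed to merge()), a rough sketch of the fully vectorized error columns could look like this:

import numpy as np
from haversine import haversine

merged = carq_data.merge(final_df, on=['POSDATETIME'], suffixes=('_carq', '_model'))

# Wind and pressure errors without apply(): np.where encodes the "0 means missing" rule.
merged['WindError'] = np.where(merged['WIND_model'] != 0,
                               (merged['WIND_carq'] - merged['WIND_model']).abs(),
                               np.nan)
merged['PresError'] = np.where(merged['PRES_model'] != 0,
                               (merged['PRES_carq'] - merged['PRES_model']).abs(),
                               np.nan)

# haversine works on scalar coordinate pairs, so the distance column still uses apply().
def dist_error(row):
    if row['LAT_model'] == 0.0:
        return np.nan
    return round(haversine((row['LAT_carq'], row['LON_carq']),
                           (row['LAT_model'], row['LON_model']), miles=True))

merged['DistError'] = merged.apply(dist_error, axis=1)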
