How do I load a pre-batched dataset in PyTorch?

I have a huge dataset that cannot be stored in memory, so I pre-batched it into several files. How do I write my Dataset and DataLoader classes so that they load one batch at a time?
All the files share the same base name plus a unique batch number. For example, a file would be called o3_batch_1.hdf5 or o3_batch_2.hdf5; the largest batch number is 102 (o3_batch_102.hdf5).
Here is what I have tried so far. Would it work?
length is the total length of the data.
batchNum is the batch number at the end of the file name.
base is the common name shared by the files.
class Data(Dataset):
    # Constructor
    def __init__(self, base, batchNum, length):
        name = base + str(batchNum)
        with h5py.File(name, "r") as f:
            puzz = np.array(f.get('puzzle'))
            sol = np.array(f.get('Sol'))
        self.puzz = torch.from_numpy(puzz)
        self.sol = torch.from_numpy(sol)
        self.len = length

    # Getter
    def __getitem__(self, batchNum, index):
        return self.puzz[index], self.sol[index]

    # Get length
    def __len__(self):
        return self.len

You can iterate over the batch index and load your data file by file through that iteration.
Suppose your files are organized in the following manner:
/yourFileDir
o3_batch_1.hdf5
o3_batch_2.hdf5
...
o3_batch_102.hdf5
And your batch index runs 1, 2, ..., 102:
h5_dir = '/yourFileDir/'
for Index in range(1, 103):
    with h5py.File(h5_dir + 'o3_batch_{}.hdf5'.format(Index), 'r') as f:
        puzz = np.array(f['puzzle'])
        sol = np.array(f['Sol'])  # this depends on how you save your data
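Building on that loop, here is a minimal sketch (not the asker's exact class) of a Dataset where each item is one pre-batched file, so the DataLoader hands back one whole batch at a time. The file pattern and the 'puzzle'/'Sol' keys come from the question; the class name and everything else is an assumption:

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class PreBatchedData(Dataset):
    """Hypothetical helper: one item == one pre-batched .hdf5 file."""
    def __init__(self, base, num_batches):
        # base would be something like '/yourFileDir/o3_batch_'
        self.base = base
        self.num_batches = num_batches

    def __len__(self):
        return self.num_batches

    def __getitem__(self, index):
        # Batch numbers start at 1 in the question, so shift the index.
        name = '{}{}.hdf5'.format(self.base, index + 1)
        with h5py.File(name, 'r') as f:
            puzz = torch.from_numpy(np.array(f['puzzle']))
            sol = torch.from_numpy(np.array(f['Sol']))
        return puzz, sol

# batch_size=None disables automatic batching, so each pre-batched file is returned as-is.
loader = DataLoader(PreBatchedData('/yourFileDir/o3_batch_', 102), batch_size=None, shuffle=True)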

Related

Working with large multiple datasets where each dataset contains multiple values - Pytorch

I'm training a neural network and have more than 15 GB of data in a folder; the folder contains multiple pickle files, and each file contains two lists that each hold multiple values.
This looks like the following:
dataset_folder:\
file.pickle
file_2.pickle
...
...
file_n.pickle
Each file_*.pickle contains two variable-length lists (list x and list y).
How can I load all of the data to train the model without running into memory issues?
By implementing a custom dataset class (subclassing PyTorch's Dataset), we need to implement three methods so the PyTorch loader can work with your data:
__init__
__getitem__
__len__
Let's go through how to implement each of them separately.
__init__
def __init__(self):
    # Original data has the following format
    """
    dict_object = {
        "x": [],
        "y": []
    }
    """
    DIRECTORY = "data/raw"
    self.directory = DIRECTORY
    self.dataset_file_name = os.listdir(DIRECTORY)
    self.dataset_file_name_index = 0
    self.dataset_length = 0
    self.prefix_sum_idx = list()
    # Loop over each file and calculate the length of the overall dataset
    # (you might need to check that file_name is actually a file)
    for file_name in self.dataset_file_name:
        with open(f'{DIRECTORY}/{file_name}', "rb") as openfile:
            dict_object = pickle.load(openfile)
            # x and y are assumed to be paired (same length), so the number
            # of samples contributed by this file is len(dict_object["x"])
            curr_page_sum = len(dict_object["x"])
            self.prefix_sum_idx.append(curr_page_sum)
            self.dataset_length += curr_page_sum
    # Prefix sum so we know in which file each global index appears.
    for i in range(1, len(self.prefix_sum_idx)):
        self.prefix_sum_idx[i] = self.prefix_sum_idx[i] + self.prefix_sum_idx[i - 1]
    assert self.prefix_sum_idx[-1] == self.dataset_length
    self.x = []
    self.y = []
As you can see above, the main idea is to use a prefix sum to treat the whole collection of files as one dataset: whenever you need to access a specific index later, you simply look into prefix_sum_idx to see which file that index falls into.
For example, say we need to access index 150. Thanks to the prefix sum, we can tell that index 150 lives in the second .pickle file. We still need a fast way to locate that index relative to prefix_sum_idx; this is explained in __getitem__.
__getitem__
def read_pickle_file(self, idx):
    # Load the idx-th file and cache its x and y lists on the instance.
    file_name = self.dataset_file_name[idx]
    with open(f'{self.directory}/{file_name}', "rb") as openfile:
        dict_object = pickle.load(openfile)
    self.x = dict_object['x']
    self.y = dict_object['y']
    # any other per-file preprocessing logic would go here

def __getitem__(self, idx):
    # Similar to C++ std::upper_bound - O(log n)
    temp = bisect.bisect_right(self.prefix_sum_idx, idx)
    self.read_pickle_file(temp)
    # Subtract the number of samples stored in the previous files.
    local_idx = idx - (self.prefix_sum_idx[temp - 1] if temp > 0 else 0)
    return self.x[local_idx], self.y[local_idx]
Check the bisect_right() docs for the details, but in short it returns the rightmost position in the sorted list at which the given element could be inserted while keeping the list sorted. In our approach we are only interested in one question: "which file should I access in order to get the appropriate data?" More importantly, it answers that in O(log n).
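As a tiny self-contained check of that mapping (the per-file counts below are made up purely for illustration):

import bisect

prefix_sum_idx = [100, 250, 300]   # three files holding 100, 150 and 50 samples
idx = 150
temp = bisect.bisect_right(prefix_sum_idx, idx)                   # -> 1, i.e. the second file
local_idx = idx - (prefix_sum_idx[temp - 1] if temp > 0 else 0)   # -> 50, position inside that file
print(temp, local_idx)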
__len__
def __len__(self):
    return self.dataset_length
To get the length of our dataset, we loop through each file and accumulate the sample counts, as shown in __init__.
The full code sample goes like this:
import pickle
import torch
import os
import bisect
from torch.utils.data import Dataset, DataLoader

class dataset(Dataset):
    def __init__(self):
        # Original data has the following format
        """
        dict_object = {
            "x": [],
            "y": []
        }
        """
        DIRECTORY = "data/raw"
        self.directory = DIRECTORY
        self.dataset_file_name = os.listdir(DIRECTORY)
        self.dataset_file_name_index = 0
        self.dataset_length = 0
        self.prefix_sum_idx = list()
        # Loop over each file and calculate the length of the overall dataset
        # (you might need to check that file_name is actually a file)
        for file_name in self.dataset_file_name:
            with open(f'{DIRECTORY}/{file_name}', "rb") as openfile:
                dict_object = pickle.load(openfile)
                # x and y are assumed to be paired (same length), so the number
                # of samples contributed by this file is len(dict_object["x"])
                curr_page_sum = len(dict_object["x"])
                self.prefix_sum_idx.append(curr_page_sum)
                self.dataset_length += curr_page_sum
        # Prefix sum so we know in which file each global index appears.
        for i in range(1, len(self.prefix_sum_idx)):
            self.prefix_sum_idx[i] = self.prefix_sum_idx[i] + self.prefix_sum_idx[i - 1]
        assert self.prefix_sum_idx[-1] == self.dataset_length
        self.x = []
        self.y = []

    def read_pickle_file(self, idx):
        # Load the idx-th file and cache its x and y lists on the instance.
        file_name = self.dataset_file_name[idx]
        with open(f'{self.directory}/{file_name}', "rb") as openfile:
            dict_object = pickle.load(openfile)
        self.x = dict_object['x']
        self.y = dict_object['y']

    def __getitem__(self, idx):
        # Similar to C++ std::upper_bound - O(log n)
        temp = bisect.bisect_right(self.prefix_sum_idx, idx)
        self.read_pickle_file(temp)
        # Subtract the number of samples stored in the previous files.
        local_idx = idx - (self.prefix_sum_idx[temp - 1] if temp > 0 else 0)
        return self.x[local_idx], self.y[local_idx]

    def __len__(self):
        return self.dataset_length

large_dataset = dataset()
train_size = int(0.8 * len(large_dataset))
validation_size = len(large_dataset) - train_size
train_dataset, validation_dataset = torch.utils.data.random_split(large_dataset, [train_size, validation_size])
validation_loader = DataLoader(validation_dataset, batch_size=64, num_workers=4, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=64, num_workers=4, shuffle=False)
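A minimal sketch of how train_loader might then be consumed; the model, loss, and tensor handling are placeholders that depend on what x and y actually contain, so this only illustrates the loop structure:

import torch
import torch.nn as nn

model = nn.Linear(8, 1)                                    # hypothetical model; the input size is a guess
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

for epoch in range(5):
    for batch_x, batch_y in train_loader:
        # The default collate_fn stacks the per-sample x and y values into batch tensors.
        batch_x = torch.as_tensor(batch_x, dtype=torch.float32)
        batch_y = torch.as_tensor(batch_y, dtype=torch.float32).unsqueeze(1)
        optimizer.zero_grad()
        loss = loss_fn(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()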

Data insert error 10427 conversion of parameters

I'm trying to load some data into a HANA 2.0 database from Python 3.x Jupyter Notebooks and I'm getting an error.
Please advise.
The type of the target column, Pregnancies, is INT, and in the CSV file the data in that column is a number, so the data should go into the Pregnancies column without conversion problems.
The error I'm getting while inserting the data is below:
hdbcli.dbapi.Error: (-10427, "Conversion of parameter/column (1) from
data type UCS2 (LE) to INT failed (invalid number: not a valid number
string 'Pregnancies')")
It's a big script; I'm getting the issue in the insert method:
@staticmethod
def insert_data(connection, tablename, cols, inlist, data, batch_size):
    sql = 'insert into ' + tablename + inlist
    if len(data) > 0:
        with connection.connection.cursor() as cur:
            rows_inserted = cur.executemany(sql, data)
@staticmethod
def file_load(connection, table_descriptions, cols, inlist, filename, file_count,
              train_percentage, valid_percentage, test_percentage, batch_size):
    with open(filename, 'r') as my_file:
        reader = csv.reader(my_file, delimiter=',')
        data = list()
        data_list = list()
        load_count = 0
        for row in reader:
            remain_count = file_count - load_count
            if remain_count < batch_size:
                batch_size = remain_count
            if len(data) <= batch_size:
                data.append(list(row))
            if len(data) == batch_size:
                DataSets.split_data_into_tables(connection, data, table_descriptions,
                                                train_percentage, valid_percentage,
                                                test_percentage, cols, inlist,
                                                batch_size, file_count)
                load_count += len(data)
                data = list()
                print("Data Loaded:{}%".format(math.floor(load_count / file_count * 100)))
The whole code for this is at the link below:
https://github.com/SAP-samples/hana-ml-samples/blob/master/Python-API/pal/notebooks/data_load_utils.py
Based on the error message text, I'm fairly sure the problem is that the import code reads the first line of the CSV file (the line that contains the column names/headers), and inserting those header names into the target table is what fails.
To avoid this, just skip the first line of the CSV file.
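A minimal sketch of that fix, assuming the header occupies exactly the first row of the file (the helper name is hypothetical, not part of the linked script):

import csv

def read_data_rows(filename):
    # Hypothetical helper: yields only data rows, never the header row.
    with open(filename, 'r') as my_file:
        reader = csv.reader(my_file, delimiter=',')
        next(reader, None)  # skip the header line ('Pregnancies', ...) so it is never inserted as data
        for row in reader:
            yield list(row)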
The error says that the value being inserted is of type UCS2, a character encoding in which each character is represented by a fixed-length 16 bits (2 bytes), while you have defined the column as type INT.
So make sure the column contains INT-compatible values in the CSV file first and then load the data.

pd.rename key KeyError: 'New_Name'

Edit 12/07/19: The problem was not in fact with the pd.rename function but with the fact that I did not return the pandas DataFrame from the function, and as a result the column change did not exist when printing, i.e.:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- This was missing
Please see the user comment below and upvote them for finding this error for me.
Alternatively, you can continue reading.
The data can be downloaded from this link, but I have added a sample dataset below. The file is not formatted as a typical CSV file; I believe it may have been an assessment piece related to the Hidden Decision Tree article. I have included the portion of the code that handles the format of the text file mentioned above and allows the user to rename a column.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seems to work when I hard-code the column names inside the rename function.
def change_column_names(as_pandas):
as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
return as_pandas
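For comparison, a parameterized version behaves the same way once the result is returned (or the in-place change is kept on the same DataFrame object). A minimal sketch, with a made-up one-column DataFrame standing in for the real data:

import pandas as pd

def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas

df = pd.DataFrame({'Unique Pageviews': [5608, 360]})
df = change_column_names(df, 'Unique Pageviews', 'Page_Views')
print(df.columns)   # Index(['Page_Views'], dtype='object')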
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the KeyError:
import csv
import pandas as pd

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)

def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'},
                     inplace=True)

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts a variable number of arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # change_column_names(multi_sets)
    change_column_names(multi_set, 'Old_Name', 'New_Name')
    print(multi_sets)

main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math

def set_new_columns(as_pandas):
    titles_list = ['Year > 2014', 'Forum', 'Blog', 'Python', 'R',
                   'Machine_Learning', 'Data_Science', 'Data',
                   'Analytics']
    for number, word in enumerate(titles_list):
        as_pandas.insert(len(as_pandas.columns), titles_list[number], 0)

def title_length(as_pandas):
    # Insert new column header then count the number of letters in 'Title'
    as_pandas.insert(len(as_pandas.columns), 'Title_Length', 0)
    as_pandas['Title_Length'] = as_pandas['Title'].map(str).apply(len)

# Although it is a log, the percentage of change is an inverse linear
# comparison of logX1 - logX2, so you could think of it as the percentage
# change in Page Views. map lets the function run on every row of the
# 'Page_Views' column.
def log_page_view(as_pandas):
    # Insert new column header
    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(lambda x: math.log(1 + float(x)))

def change_to_numeric(as_pandas):
    # Check for missing values then convert the column to numeric.
    as_pandas = as_pandas.replace(r'^\s*$', np.nan, regex=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts a variable number of arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    import csv
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    multi_sets = change_column_names(multi_sets)
    change_to_numeric(multi_sets)
    log_page_view(multi_sets)
    title_length(multi_sets)
    set_new_columns(multi_sets)
    print(multi_sets)

main()

Why can't I split files when generating some TFrecord files?

I'm doing some work predicting protein structures. As you may know, one protein molecule can have several strands, so I need to split the list of atoms into different TFRecords by strand name.
The problem is that this code ends up generating several TFRecord files with nothing written in them. All blank.
Alternatively, is there a way to split the strands while training my model? Then I could ignore this problem and store the strand name in the TFRecords as a feature.
'''
with all modules imported and no errors raised
'''
def generate_TFrecord(intPosition, endPosition, path):
    CrtS = x  # x is the name of the current strand
    path = path + CrtS
    writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
    for i in range(intPosition, endPosition):
        if identifyCoreCarbon(i):
            vectros = getVectors(i)
            features = {}
            '''
            feeding this dict
            '''
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            tf_serialized = tf_example.SerializeToString()
            writer.write(tf_serialized)
            '''
            if checkStrand(i) == False:
                writer.write(tf_serialized)
                intPosition = i
            '''
    writer.close()

'''
strand_index is a list of all the start positions of the single strands
'''
for loop in strand_index:
    generate_TFrecord(loop, endPosition, path)

'''
________division___________
The code below works, but it only generates a single tfrecord containing all the atom information.

writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
for i in range(0, endPosition):
    if identifyCoreCarbon(i):
        vectros = getVectors(i)
        features = {}
        # feeding features
        tf_features = tf.train.Features(feature=features)
        tf_example = tf.train.Example(features=tf_features)
        tf_serialized = tf_example.SerializeToString()
        writer.write(tf_serialized)
writer.close()
'''
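A minimal, self-contained sketch of the alternative mentioned above (storing the strand name as a feature in each record so the split can happen later, at training time); the feature names, values, and output file name here are made up for illustration:

import tensorflow as tf

def serialize_atom(strand_name, vector):
    # Hypothetical record layout: one bytes feature for the strand, one float feature for the data.
    features = {
        'strand': tf.train.Feature(bytes_list=tf.train.BytesList(value=[strand_name.encode('utf-8')])),
        'vector': tf.train.Feature(float_list=tf.train.FloatList(value=vector)),
    }
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example.SerializeToString()

with tf.io.TFRecordWriter('atoms.tfrecord') as writer:
    writer.write(serialize_atom('A', [0.1, 0.2, 0.3]))
    writer.write(serialize_atom('B', [0.4, 0.5, 0.6]))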

Python Multiprocessing throwing out results based on previous values

I am trying to learn how to use multiprocessing and have managed to get the code below to work. The goal is to work through every combination of the variables within CostlyFunction by setting n equal to some number (right now it is 100, so the first 100 combinations are tested). I was hoping I could process w as each worker returned its list (CostlyFunction returns a list of 7 values) and keep only the results in a given range. Right now, w holds all 100 lists and then lets me manipulate those lists, but when I use n = 10 million, w becomes huge and costly to hold. Is there a way to evaluate CostlyFunction's output as the workers return values and then throw out the values I don't need?
if __name__ == "__main__":
    import csv
    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    #width = -36000000/1000
    #fronteir = [None]*1000
    currtime = time()
    n = 100
    po = Pool()
    res = po.map_async(CostlyFunction, ((i,) for i in range(n)))
    w = res.get()
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(w)
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
Unfortunately, Pool doesn't have a 'filter' method; otherwise, you might've been able to prune your results before they're returned. Pool.imap is probably the best solution you'll find for dealing with your memory issue: it returns an iterator over the results from CostlyFunction.
For sorting through the results, I made a simple list-based class called TopList that stores a fixed number of items. All of its items are the highest-ranked according to a key function.
from collections import UserList

def keyfunc(a):
    return a[5]  # This would be the sixth item in a result from CostlyFunction

class TopList(UserList):
    def __init__(self, key, *args, cap=10):  # cap is the largest number of results
        super().__init__(*args)               # you want to store
        self.cap = cap
        self.key = key

    def add(self, item):
        self.data.append(item)
        self.data.sort(key=self.key, reverse=True)
        if len(self.data) > self.cap:
            self.data.pop()  # drop the lowest-ranked item once the cap is exceeded
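A quick standalone check of TopList with made-up tuples, just to show that only the cap highest-ranked items are kept:

best = TopList(keyfunc, cap=3)
for value in [4, 9, 1, 7, 2, 8]:
    # pad the tuple so index 5 (used by keyfunc) holds the value we rank by
    best.add((0, 0, 0, 0, 0, value, 0))
print([item[5] for item in best])   # -> [9, 8, 7]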
Here's how your code might look:
if __name__ == "__main__":
    import csv
    from multiprocessing import Pool
    from time import time

    csvFile = open('C:\\Users\\bryan.j.weiner\\Desktop\\test.csv', 'w', newline='')
    n = 100
    currtime = time()
    po = Pool()
    best = TopList(keyfunc)
    result_iter = po.imap(CostlyFunction, ((i,) for i in range(n)))
    for result in result_iter:
        best.add(result)
    spamwriter = csv.writer(csvFile, delimiter=',')
    spamwriter.writerows(best)
    print(('2: parallel: time elapsed:', time() - currtime))
    csvFile.close()
