torch dataloader for large csv file - incremental loading - pytorch

I am trying to write a custom torch DataLoader so that large CSV files can be loaded incrementally (in chunks).
I have a rough idea of how to do that. However, I keep getting some PyTorch error that I do not know how to solve.
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
# Create dummy csv data
nb_samples = 110
a = np.arange(nb_samples)
df = pd.DataFrame(a, columns=['data'])
df.to_csv('data.csv', index=False)
# Create Dataset
class CSVDataset(Dataset):
    def __init__(self, path, chunksize, nb_samples):
        self.path = path
        self.chunksize = chunksize
        self.len = nb_samples / self.chunksize

    def __getitem__(self, index):
        x = next(
            pd.read_csv(
                self.path,
                skiprows=index * self.chunksize + 1,  # +1, since we skip the header
                chunksize=self.chunksize,
                names=['data']))
        x = torch.from_numpy(x.data.values)
        return x

    def __len__(self):
        return self.len
dataset = CSVDataset('data.csv', chunksize=10, nb_samples=nb_samples)
loader = DataLoader(dataset, batch_size=10, num_workers=1, shuffle=False)
for batch_idx, data in enumerate(loader):
    print('batch: {}\tdata: {}'.format(batch_idx, data))
I get the error: 'float' object cannot be interpreted as an integer

The error is caused by this line:
self.len = nb_samples / self.chunksize
When dividing with / the result is always a float, but __len__() must return an integer. Therefore you have to round self.len and convert it to an integer, for example by simply doing this:
self.len = nb_samples // self.chunksize
The double slash (//) performs floor division: it rounds down and returns an integer.
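A quick illustration with the numbers from the example above (the 105-sample case is hypothetical, added only to show the rounding):
nb_samples, chunksize = 110, 10
print(nb_samples / chunksize)   # 11.0 -> float; rejected once len() is called
print(nb_samples // chunksize)  # 11   -> int; valid for __len__()
print(105 // chunksize)         # 10   -> floor division drops a trailing partial chunk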
Edit:
You actually CAN return a float from __len__(), but the error occurs as soon as len(dataset) is called. So I guess len(dataset) is called somewhere inside the DataLoader class.
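A minimal sketch confirming this, using a toy class (not from the question) whose __len__() returns a float:
class Dummy:
    def __len__(self):
        return 11.0  # defining this is fine...

len(Dummy())  # ...but this raises: TypeError: 'float' object cannot be interpreted as an integer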

Related

Cannot pickle dateparser using cloudpickle

I'm using the dateparser library to parse some strings and return potential dates. I need to use cloudpickle for distributed use but am receiving an error:
import cloudpickle
import dateparser

class DateParser:
    def __init__(self,
                 threshold: float = 0.5,
                 pos_label: str = 'date'):
        self.threshold = threshold
        self.pos_label = pos_label

    def __call__(self):
        dateparser.parse('20/12/2022')

date_parser = DateParser()
with open('/path/parser.cloudpickle', 'wb+') as fout:
    cloudpickle.dump(date_parser, fout, protocol=4)
TypeError: can't pickle _thread.lock objects
However, when I use plain pickle it works just fine:
import pickle
with open('/path/parser.pickle', 'wb+') as fout:
    pickle.dump(date_parser, fout, protocol=4)

# also loads just fine:
with open('/path/parser.pickle', 'rb+') as fin:
    pickle.load(fin)
I can get around this issue by importing dateparser inside the __init__ of DateParser, but I'm not sure why this should be the fix.
class DateParser:
    def __init__(self,
                 threshold: float = 0.5,
                 pos_label: str = 'date'):
        import dateparser
        self.threshold = threshold
        self.pos_label = pos_label
I looked online and it seems this thread-lock complaint is most common with multiprocessing calls, but as far as I can tell that doesn't happen in the underlying dateparser library. And shouldn't this have broken plain pickling anyway?

how to fix "Missing mandatory keyword argument 'size' in function call" pylint(missing-kwoa) E1125

I want to pass an array and an int to a function in a different file and return an int (1 to 9).
I have simplified the code, but it still shows error E1125.
# in basic.py
import neural_network as nu
import numpy as np
import math

def AI_connector():
    Inputv = np.zeros((9), dtype=float)  # input array
    size = 9
    return nu.connector(Inputv, size)
# in neural_network.py
import numpy as np
import math
import random

Inputv = np.zeros((9), dtype=float)  # input array

def connector(myList=[], *args, size):
    Inputv = np.zeros((size), dtype=float)  # input array
    Inputv = myList
    return 0
This line
def connector(myList=[], *args, size):
should be
def connector(size, myList=[], *args):
In the original signature, size comes after *args, which makes it a keyword-only parameter: it can only be passed as size=..., so the positional call nu.connector(Inputv, size) triggers the missing-kwoa error. As a rule, put the non-default (positional) parameters first, then the ones with default values, and *args and **kwargs at the end.
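A minimal sketch of the two call patterns that satisfy pylint (names taken from the question; note that with the reordered signature the arguments also swap position at the call site):
# with the original signature def connector(myList=[], *args, size):
nu.connector(Inputv, size=size)   # size is keyword-only, so pass it by name

# with the reordered signature def connector(size, myList=[], *args):
nu.connector(size, Inputv)        # size is now the first positional parameter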

AttributeError: 'module' object has no attribute 'cuda'

I was trying to run this repository: https://github.com/WaqasSultani/AnomalyDetectionCVPR2018
In Test_Anomaly_Detector_public.py I am stuck at this line: theano.sandbox.cuda.use('gpu0')
It fails with: AttributeError: 'module' object has no attribute 'cuda'.
I am using Theano as the backend.
This is Test_Anomaly_Detector_public.py:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.optimizers import SGD ,Adagrad
from scipy.io import loadmat, savemat
from keras.models import model_from_json
import theano.tensor as T
import theano
import csv
import ConfigParser
import collections
import time
import csv
import os
from os import listdir
import skimage.transform
from skimage import color
from os.path import isfile, join
import numpy as np
import numpy
from datetime import datetime
from scipy.spatial.distance import cdist,pdist,squareform
import theano.sandbox
import shutil
theano.sandbox.cuda.use('gpu0')
seed = 7
numpy.random.seed(seed)
def load_model(json_path):  # Function to load the model
    model = model_from_json(open(json_path).read())
    return model

def load_weights(model, weight_path):  # Function to load the model weights
    dict2 = loadmat(weight_path)
    dict = conv_dict(dict2)
    i = 0
    for layer in model.layers:
        weights = dict[str(i)]
        layer.set_weights(weights)
        i += 1
    return model

def conv_dict(dict2):
    i = 0
    dict = {}
    for i in range(len(dict2)):
        if str(i) in dict2:
            if dict2[str(i)].shape == (0, 0):
                dict[str(i)] = dict2[str(i)]
            else:
                weights = dict2[str(i)][0]
                weights2 = []
                for weight in weights:
                    if weight.shape in [(1, x) for x in range(0, 5000)]:
                        weights2.append(weight[0])
                    else:
                        weights2.append(weight)
                dict[str(i)] = weights2
    return dict

# Load Video
def load_dataset_One_Video_Features(Test_Video_Path):
    VideoPath = Test_Video_Path
    f = open(VideoPath, "r")
    words = f.read().split()
    num_feat = len(words) / 4096
    # Number of features per video to be loaded. In our case num_feat=32, as we divide the video into 32 segments. Note that
    # we have already computed C3D features for the whole video and divided the video features into 32 segments.
    count = -1
    VideoFeatues = []
    for feat in xrange(0, num_feat):
        feat_row1 = np.float32(words[feat * 4096:feat * 4096 + 4096])
        count = count + 1
        if count == 0:
            VideoFeatues = feat_row1
        if count > 0:
            VideoFeatues = np.vstack((VideoFeatues, feat_row1))
    AllFeatures = VideoFeatues
    return AllFeatures
print("Starting testing...")
AllTest_Video_Path = '/newdata/UCF_Anomaly_Dataset/Dataset/CVPR_Data/C3D_Complete_Video_txt/Test/'
# AllTest_Video_Path contains C3D features (txt file) of each video. Each file contains 32 features, each of 4096 dimensions.
Results_Path = '../Eval_Res/'
# Results_Path is the folder where you can save your results
Model_dir='../Trained_AnomalyModel/'
# Model_dir is the folder where we have placed our trained weights
weights_path = Model_dir + 'weights_L1L2.mat'
# weights_path is Trained model weights
model_path = Model_dir + 'model.json'
if not os.path.exists(Results_Path):
    os.makedirs(Results_Path)
All_Test_files= listdir(AllTest_Video_Path)
All_Test_files.sort()
model=load_model(model_path)
load_weights(model, weights_path)
nVideos=len(All_Test_files)
time_before = datetime.now()
for iv in range(nVideos):
    Test_Video_Path = os.path.join(AllTest_Video_Path, All_Test_files[iv])
    inputs = load_dataset_One_Video_Features(Test_Video_Path)  # 32 segments features for one testing video
    predictions = model.predict_on_batch(inputs)  # Get anomaly prediction for each of 32 video segments.
    aa = All_Test_files[iv]
    aa = aa[0:-4]
    A_predictions_path = Results_Path + aa + '.mat'  # Save array of 1*32, containing anomaly score for each segment. Please see Evaluate Anomaly Detector to compute ROC.
print "Total Time took: " + str(datetime.now() - time_before)
My .theanorc file:
[global]
floatX = float32
device = cuda0
[gpuarray]
preallocate = 1
You can comment out that line (theano.sandbox.cuda.use('gpu0')). Then run the script with the device selected through Theano flags instead:
THEANO_FLAGS=mode=FAST_RUN,device=cuda0,floatX=float32 python [...]
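For reference, a minimal sketch of the edit in Test_Anomaly_Detector_public.py (the script name is taken from the question):
# theano.sandbox.cuda.use('gpu0')  # commented out; the GPU is selected via THEANO_FLAGS or .theanorc instead
The device=cuda0 flag matches the .theanorc shown above.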

Computing precision and recall for two sets of keywords in NLTK and Scikit for sets of different sizes

I am trying to compute precision and recall for two sets of keywords. The gold_standard has 823 terms and the test has 1497 terms.
Using nltk.metrics's version of precision and recall, I am able to provide the two sets just fine. But doing the same for Scikit is throwing me an error:
ValueError: Found arrays with inconsistent numbers of samples: [ 823 1497]
How do I resolve this?
#!/usr/bin/python3
from nltk.metrics import precision, recall
from sklearn.metrics import precision_score
from sys import argv
from time import time
import numpy
import csv

def readCSVFile(filename):
    termList = set()
    with open(filename, 'rt', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            termList.update(row)
    return termList

def readDocuments(gs_file, fileToProcess):
    print("Reading CSV files...")
    gold_standard = readCSVFile(gs_file)
    test = readCSVFile(fileToProcess)
    print("All files successfully read!")
    return gold_standard, test

def calcPrecisionScipy(gs, test):
    gs = numpy.array(list(gs))
    test = numpy.array(list(test))
    print("Precision Scipy: ", precision_score(gs, test, average=None))

def process(dataset):
    print("Processing input...")
    gs, test = dataset
    print("Precision: ", precision(gs, test))
    calcPrecisionScipy(gs, test)

def usage():
    print("Usage: python3 generate_stats.py gold_standard.csv termlist_to_process.csv")

if __name__ == '__main__':
    if len(argv) != 3:
        usage()
        exit(-1)
    t0 = time()
    process(readDocuments(argv[1], argv[2]))
    print("Total runtime: %0.3fs" % (time() - t0))
I referred to the following pages for coding:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
=================================Update===================================
Okay, so I tried to add 'nonsensical' data to the shorter set to make the lengths equal:
def calcPrecisionScipy(gs, test):
    if len(gs) < len(test):
        gs.update(list(range(len(test) - len(gs))))
    gs = numpy.array(list(gs))
    test = numpy.array(list(test))
    print("Precision Scipy: ", precision_score(gs, test, average=None))
Now I have another error:
UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
It is not possible for scikit-learn to compute precision or recall for two label arrays of different lengths, since it compares the i-th predicted label with the i-th true label. I guess what nltk must do is truncate the sets to the same length; you can do the same in your script:
import numpy as np
import sklearn.metrics

set1 = [True, True]
set2 = [True, False, False]
length = np.amin([len(set1), len(set2)])
set1 = set1[:length]
set2 = set2[:length]
print(sklearn.metrics.precision_score(set1, set2))
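Applied to the question's calcPrecisionScipy, a minimal sketch of the same truncation idea (gs and test are the term sets read from the CSV files; note that the ordering of a Python set is arbitrary):
def calcPrecisionScipy(gs, test):
    # truncate both label lists to the shorter length, as in the toy example above
    length = min(len(gs), len(test))
    gs = numpy.array(list(gs)[:length])
    test = numpy.array(list(test)[:length])
    print("Precision Scipy: ", precision_score(gs, test, average=None))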

random forest with spark: get predicted values and R²

I am using MLlib of spark to perform a regression random forest.
I am using the python code here:
https://spark.apache.org/docs/1.2.0/mllib-ensembles.html#tab_python_1
It works, but now I would like to get the predicted values as well as the R or R² of the prediction model.
How can I get those?
Here is how to load a CSV file into an RDD (Spark's data format):
# Imports
import csv
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from collections import namedtuple
from operator import add, itemgetter
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
import shutil
import numpy

def parse(row):
    """
    Parses a row and returns a LabeledPoint.
    """
    row[0] = str(row[0])
    row[1] = float(row[1])
    row[2] = float(row[2])
    row[3] = float(row[3])
    row[4] = float(row[4])
    return LabeledPoint(row[4], row[:4])

def split(line):
    """
    Operator function for splitting a line with the csv module
    """
    reader = csv.reader(StringIO(line), delimiter=';')
    return next(reader)

# Load the csv file into a Spark RDD
data = sc.textFile("datafile").map(split).map(parse)
Here is how to perform the random forest algorithm and how to get the predicted values:
def random_forest_regression(data):
    """
    Run the random forest (regression) algorithm on the data to perform the prediction
    """
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Increase the number of trees to get a better prediction
    model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, numTrees=100,
                                        featureSubsetStrategy="auto", impurity='variance', maxDepth=10, maxBins=32)
    # Evaluate the model on the TEST instances and compute the test error
    predictions_test = model.predict(testData.map(lambda x: x.features))
    real_and_predicted_test = testData.map(lambda lp: lp.label).zip(predictions_test)
    # Get the list of real and predicted values FOR ALL THE POINTS
    predictions = model.predict(data.map(lambda x: x.features))
    real_and_predicted = data.map(lambda lp: lp.label).zip(predictions)
    real_and_predicted = real_and_predicted.collect()
    print("real and predicted values")
    for value in real_and_predicted:
        print(value)
    return model, real_and_predicted
To get the correlation coefficient (R value), I used numpy:
def compute_correlation_coefficient(real_and_predicted):
    """
    Compute and display the correlation coefficient from a list of real and predicted values
    """
    list1 = []
    list2 = []
    for tuple in real_and_predicted:
        list1.append(tuple[0])
        list2.append(tuple[1])
    print("correlation coefficient")
    print(numpy.corrcoef(list1, list2)[0, 1])
To get R², square the correlation coefficient.
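Putting the pieces together, a minimal usage sketch (data is the RDD built above; the function names are the ones defined in this answer):
model, real_and_predicted = random_forest_regression(data)
compute_correlation_coefficient(real_and_predicted)

# R² is the square of the correlation coefficient
r = numpy.corrcoef([t[0] for t in real_and_predicted],
                   [t[1] for t in real_and_predicted])[0, 1]
print("R squared: %0.3f" % (r ** 2))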
Voilà !
