BOW Chunking dataset - scikit-learn

I am attempting chunk my data set while using countvectorizer and TfidfVectorizer. Here is what I have so far.
def BOW(data):
df_temp = data.copy(deep = True)
df_temp = basic_preprocessing(df_temp)
#number of words in each chunk
num_words = 2000
chunks = []
counter = 0
df_temp = splitter(df_temp['text'], num_words)
for text in df_temp:
chunk = {'index': counter, 'text': text}
chunks.append(chunk)
counter += 1
count_vectorizer = CountVectorizer(max_df = 0.85)
count_vectorizer.fit([chunk['text'] for chunk in chunks])
list_corpus = df_temp["text"].tolist()
list_labels = df_temp["class_label"].tolist()
X = count_vectorizer.transform(list_corpus)
return X, list_labels
Not sure if this is correct or not. Here are the links to where I tried to reproduce.
How to avoid memory overloads using SciKit Learn

Related

why I have output like this for make pair in siamese network?

This is my code, and I take the output that illustrates the below. Len of idx is 1000. I want to make image pairs and label pairs, but I take errors like this:
Code:
pair_images = []
pair_labels = []
new_labels = []
for k in labels:
new_labels.append(int(k))
numClasses = len(np.unique(new_labels))
new_labels = np.array(new_labels)
idx = [np.where(new_labels == i)[0] for i in range(0,numClasses)]
for idxA in range(len(images)):
# Make Posetive Images
currentImage = images[idxA]
label = new_labels[idxA]
idxB = np.random.choice(idx[label])
posImage = images[idxB]
output:
idxB = np.random.choice(idx[label])
IndexError: list index out of range

How can I speed up using Pytorch DataLoader?

I had a dataset including about a million of rows. Before, I read the rows, preprocessed data and created a list of rows to be trained. Then I defined a Dataloader over this data like:
train_dataloader = torch.utils.data.DataLoader(mydata['train'],
batch_size=node_batch_size,shuffle=shuffle,collate_fn=data_collator)
Preprocessing could be time consuming, so I thought to define an IterableDataSet with __iter__ function. Then I could define my Dataloader like:
train_dataloader = torch.utils.data.DataLoader(myds['train'],
batch_size=node_batch_size,shuffle=shuffle,collate_fn=data_collator)
However, still to begin training it seems that it calls my preprocessing function and creates an Iteration over it. So, it seems I didn't gain much speed up.
Please guide me how could I use speed up in this case?
Here is my part of my class:
def __iter__(self):
iter_start = self.start
iter_end = self.num_samples
worker_info = torch.utils.data.get_worker_info()
if worker_info is None: # single-process data loading, return the full iterator
iter_start = self.start
iter_end = self.num_samples
else: # in a worker process
# split workload
per_worker = int(math.ceil((self.num_samples - self.start) / float(worker_info.num_workers)))
worker_id = worker_info.id
iter_start = self.start + worker_id * per_worker
iter_end = min(iter_start + per_worker, self.num_samples)
if self.flat_data:
return iter(self.flat_data)
else:
return iter(self.fill_data(iter_start, iter_end))
def fill_data(self, iter_start, iter_end, show_progress=False):
flat_data = []
if iter_end < 0:
iter_end = self.num_samples
kk = 0
dlog.info("========================== SPLIT: %s", self.split_name)
dlog.info("get data from %s to %s", iter_start, iter_end)
dlog.info("total rows: %s", len(self.split_df))
if show_progress:
pbar = tqdm(total = self.num_samples)
for index, d in self.split_df.iterrows():
if kk < iter_start:
dlog.info("!!!!!!!!! before start %s", iter_start)
kk += 1
continue
rel = d["prefix"]
...
# preprocessing and adding to returned list
I did preprosessing in the fill_data or __iter__ body. However, I can use a map for preprocessing. Then the preprocessing is called during training and for every batch and not before training.
import pandas as pd
import torch
class MyDataset(torch.utils.data.IterableDataset):
def __init__(self, fname, until=10):
self.df = pd.read_table("atomic/" + fname)
self.until = until
def preproc(self, t):
prefix, data = t
text = "Preproc: " + prefix + "|" + data
print(text) # to check when it is called
return text
def __iter__(self):
_iter = self.df_iter()
return map(self.preproc, _iter)
def df_iter(self):
ret = []
for idx, row in self.df.iterrows():
ret.append((row["prefix"],row["input_text"]))
return iter(ret)

keras BatchGenerator(keras.utils.Sequence) is too slow

I'm using a custom batch generator with large dataframe. but the Generator takes too much time to generate a batch, it takes 127s to generate a batch of 1024. I've tried Dask but still, the processing is slow. is there any way to integrate multiprocessing with inside the generator. knowing that I've tried use_multiprocessing=True with workers=12
import keras
from random import randint
import glob
import warnings
import numpy as np
import math
import pandas as pd
import dask.dataframe as dd
class BatchGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, labels=None, batch_size=8, n_classes=4, shuffle=True,
seq_len=6, data_path=None, meta_path=None,list_IDs=None):
'Initialization'
self.batch_size = batch_size
self.labels = labels
self.n_classes = n_classes
self.shuffle = shuffle
self.seq_len = seq_len
self.meta_df = meta_path
self.data_df = data_path
self.data_df = self.data_df.astype({"mjd": int})
self.list_IDs = list_IDs
if self.list_IDs==None:
self.list_IDs = list(self.meta_df['object_id'].unique())
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(len(self.list_IDs) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_IDs_temp):
X_dat = np.zeros((self.batch_size, self.seq_len,6,1))
Y_mask = np.zeros((self.batch_size, self.seq_len,6,1))
# Y_dat = np.empty((self.batch_size,1), dtype=int)
X_length= np.empty((self.batch_size,1), dtype=int)
for i, trans_id in enumerate(list_IDs_temp):
curve = self.data_df[self.data_df.object_id==trans_id]
mjdlist = list(curve['mjd'].unique())
ts_length = len(mjdlist)
if ts_length <= self.seq_len :
start_ind = 0
else :
start_ind = randint(0, ts_length - self.seq_len)
ts_length = self.seq_len
for j in range(ts_length):
if j+start_ind < len(mjdlist):
step = curve[curve.mjd==mjdlist[j+start_ind]]
for k in range(len(step.mjd)):
obs = step[step.passband==k]
if len(obs) == 0 :
# print('here is one')
continue
else:
if k == 0:
X_dat[i,j,0,0] =obs.flux.iloc[0]
Y_mask[i,j,0,0] = 1
if k == 1:
X_dat[i,j,1,0] = obs.flux.iloc[0]
Y_mask[i,j,1,0] = 1
if k == 2:
X_dat[i,j,2,0] = obs.flux.iloc[0]
Y_mask[i,j,2,0] = 1
if k == 3:
X_dat[i,j,3,0] = obs.flux.iloc[0]
Y_mask[i,j,3,0] = 1
if k == 4:
X_dat[i,j,4,0] = obs.flux.iloc[0]
Y_mask[i,j,4,0] = 1
if k == 5:
X_dat[i,j,5,0] = obs.flux.iloc[0]
Y_mask[i,j,5,0] = 1
# meta = self.meta_df[self.meta_df['object_id'] == trans_id]
# Y_dat[i] = self.labels[int(meta['target'])]
X_length[i,0] = ts_length
flux_max = np.max(X_dat[i])
flux_min = np.min(X_dat[i])
flux_pow = math.log2(flux_max - flux_min)
X_dat[i] /= flux_pow
X_noised = X_dat + np.random.uniform(low=0, high=0.5, size=X_dat.shape)
return [X_noised, X_length, np.reshape(Y_mask,(self.batch_size, self.seq_len*6))], np.reshape(X_dat,(self.batch_size, self.seq_len*6))
To make it faster, the for loop in the function __data_generation should be parallelized. Using the joblib package may help.

Problem with LSH implementation from Datasketch (size of input data > 150000)

I am a beginner data scientist, trying to write fast-duplicate search using LSH implementation from datasketch. When I run my program with input text with big size(number of docs > 250000), step 1 is fine, but then program hangs on step 2. When I run program with small input, everything works fine. Is there any decision how to fix this problem?
def LSH(data, num_perm = 128, threshold = 0.5, check_const = 0.9):
vec_unig = CountVectorizer(min_df=50, analyzer = 'word', stop_words = ['_dot_', '_comma_''_voskl_'], ngram_range=(1,2))
X = vec_unig.fit_transform([" ".join(i) for i in data])
length = X.shape[0]
array1 = []
print("Collection:" ,length)
print("Step 1:")
print("Form Minhash")
start = datetime.now()
for i in range(len(data)):
print(i)
m = MinHash(num_perm = num_perm)
for d in data[i]:
m.update(d.encode('utf8'))
array1.append(m)
print(datetime.now()- start)
print("Step 2")
print("Form potential clusters")
start = datetime.now()
lsh = MinHashLSH(threshold = threshold, num_perm = num_perm)
for i in range(len(array1)):
if ((i % 100) == 0):
print(i)
lsh.insert(i, array1[i])
print(datetime.now()- start)

file name train_tripletloss.py showing IndexError: index 2 is out of bounds for axis 0 with size 2

I have a dataset with only 2 classes, I was trying to train the dataset using the train_tripletloss.py as per https://github.com/davidsandberg/facenet/wiki/Triplet-loss-training
Please help me to solve it...
the part of code where error occurred is shown below:
def sample_people(dataset, people_per_batch, images_per_person):
nrof_images = people_per_batch * images_per_person
# Sample classes from the dataset
nrof_classes = len(dataset)
class_indices = np.arange(nrof_classes)
np.random.shuffle(class_indices)
i = 0
image_paths = []
num_per_class = []
sampled_class_indices = []
# Sample images from these classes until we have enough
while len(image_paths)<nrof_images:
class_index = class_indices[i]
nrof_images_in_class = len(dataset[class_index])
image_indices = np.arange(nrof_images_in_class)
np.random.shuffle(image_indices)
nrof_images_from_class = min(nrof_images_in_class, images_per_person, nrof_images-len(image_paths))
idx = image_indices[0:nrof_images_from_class]
image_paths_for_class = [dataset[class_index].image_paths[j] for j in idx]
sampled_class_indices += [class_index]*nrof_images_from_class
image_paths += image_paths_for_class
num_per_class.append(nrof_images_from_class)
i+=1
return image_paths, num_per_class
I think the error occurred at class_index = class_indices[i]

Resources