Pad data using tf.data.Dataset - python-3.x

I have to use tf.data.Dataset for creating a input pipeline for an RNN model in tensorflow. I am providing a basic code, by which I need to pad the data in batch with a pad token and use it for further manipulation.
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen,(tf.int32,tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
print(sess.run(iter2.get_next()))
I hope the code is self explanatory with comments. But I am getting following error,
InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 0. First element had shape [11] and element 1 had shape [12].
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?]], output_types=[DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]

I believe that since your generator yields two outputs, your padded_shapes and padded_values tuples must have a length of two. For me, this works:
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111), tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)

Finally got the answer. The issue was for the second padded shapes instead of Tensorshape([None]), we should provide [], because the second item returned by the generator is a scalar. If using Tensorshape([None]),, make sure we are returning a vector
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), (tf.TensorShape([None]), []))
padded_shapes = (tf.TensorShape([None]), []) # sentence of unknown size
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=(-111, 0))
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
sess.run(iter2.get_next())

Related

Identifying the contents of training and testing dataset after using train_test_split()

I am doing multi-class classification using ML. After preprocessing the data, I am using train_test_split function to divide the data into training and testing dataset. Is there a way to know how many samples from each class are present in the training and testing dataset? For example:
Class
No. of Training Samples
No. of Testing Samples
a
30
5
b
20
10
c
25
5
My Code:
classes = ['a','b','c']
def pp():
data_list=[]
for index,label in enumerate(classes):
class_list=[]
if label=='silence':
silence_path = os.path.join(C["dire"],'silence')
if not os.path.exists(silence_path):
os.mkdir(silence_path)
silence_stride = 2000
#sample_rate = 16000
folder = os.path.join(C["dire"],'_background_noise_')
for file_ in os.listdir(folder):
if '.wav' in file_:
load_path = os.path.join(folder,file_)
sample_rate,y = wavfile.read(load_path)
for i in range(0,len(y)-sample_rate,silence_stride):
file_path = "silence/{}_{}.wav".format(file_[:-4],i)
y_slice = y[i:i+sample_rate]
wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
class_list.append(file_path)
else:
folder = os.path.join(C["dire"],label)
for file_ in os.listdir(folder):
file_path = '{}/{}'.format(label,file_)
class_list.append(file_path)
random.shuffle(class_list)
data_list.append(class_list)
X = []
Y = []
preemphasis = 0.985
print("Feature Extraction Started")
for i,class_list in enumerate(data_list):
for j,samples in enumerate(class_list):
if(samples.endswith('.wav')):
sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
if(audio.size<sample_rate):
audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
coeff = mfccforconfidence.mfcc(audio,sample_rate,preemphasis)
X.append(coeff)
#print(X)
if(samples.split('/')[0] in classes):
Y.append(samples.split('/')[0])
elif(samples.split('/')[0]=='_background_noise_'):
Y.append('silence')
A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
for i in range(0,len(X)):
A[i] = np.array(X[i]) #Converting list X into array A
# print(A.shape)
end1 = time.time()
print("Time taken for feature extraction:{}sec".format(end1-start))
MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
MLB.classes_ #Same like classes array
print(Y_MLB.shape)
Y = Y_MLB
X = tf.keras.utils.normalize(X)
X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)
print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)
So, basically I am using ML for audio classification. After extracting the features, I divide the data into training and testing dataset.
I hope that this piece of code will be useful to answer the question.
If you have a "3D numpy array", here's a demonstration of one way you could do it.
import numpy as np
from random import randint,choices
# Create some data
my_data = np.array(list(zip(
(randint(0,100) for _ in range(100)),
(choices(["a","b","c"], k=100)),
(randint(0,100) for _ in range(100))
))
)
# Show the first 5 elements
print(my_data[0:5,:])
# [['69' 'a' '38']
# ['18' 'c' '73']
# ['57' 'a' '50']
# ['35' 'a' '60']
# ['52' 'b' '1']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(my_data[:,[0,1]], my_data[:,2])
from collections import Counter
print(Counter(X_train[:,1]))
# Counter({'c': 31, 'b': 26, 'a': 18})
print(Counter(X_train[:,1])["a"])
# 18
print(Counter(X_test[:,1]))
# Counter({'b': 12, 'c': 7, 'a': 6})

using a list as a value in pandas.DataFeame and tensorFlow

Im tring to use list as a value in pandas.DataFrame
but Im getting Exception when trying to use use the adapt function in on the Normalization layer with the NumPy array
this is the error:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
and this is the code:
import pandas as pd
import numpy as np
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow.keras import layers
data = [[45.975, 45.81, 45.715, 45.52, 45.62, 45.65, 4],
[55.67, 55.975, 55.97, 56.27, 56.23, 56.275, 5],
[86.87, 86.925, 86.85, 85.78, 86.165, 86.165, 3],
[64.3, 64.27, 64.285, 64.29, 64.325, 64.245, 6],
[35.655, 35.735, 35.66, 35.69, 35.665, 35.63, 5]
]
lables = [0, 1, 0, 1, 1]
def do():
d_1 = None
for l, d in zip(lables, data):
if d_1 is None:
d_1 = pd.DataFrame({'lable': l, 'close_price': [d]})
else:
d_1 = d_1.append({'lable': l, 'close_price': d}, ignore_index=True)
dataset = d_1.copy()
print(dataset.isna().sum())
dataset = dataset.dropna()
print(dataset.keys())
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print(train_dataset.describe().transpose())
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('lable')
test_labels = test_features.pop('lable')
print(train_dataset.describe().transpose()[['mean', 'std']])
normalizer = tf.keras.layers.Normalization(axis=-1)
ar = np.array(train_features)
normalizer.adapt(ar)
print(normalizer.mean.numpy())
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
print('First example:', first)
print()
print('Normalized:', normalizer(first).numpy())
diraction = np.array(train_features)
diraction_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
diraction_normalizer.adapt(diraction)
diraction_model = tf.keras.Sequential([
diraction_normalizer,
layers.Dense(units=1)
])
print(diraction_model.summary())
print(diraction_model.predict(diraction[:10]))
diraction_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1),
loss='mean_absolute_error')
print(train_features['close_price'])
history = diraction_model.fit(
train_features['close_price'],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2)
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())
test_results = {}
test_results['diraction_model'] = diraction_model.evaluate(
test_features,
test_labels, verbose=0)
x = tf.linspace(0.0, 250, 251)
y = diraction_model.predict(x)
print("end")
def main():
do()
if __name__ == "__main__":
main()
I think it is not the usual practice to shrink your features into one column.
Quick-fix is you may put the following line
train_features = np.array(train_features['close_price'].to_list())
before
normalizer = tf.keras.layers.Normalization(axis=-1)
to get rid of the error, but now because your train_features has changed from a DataFrame into a np.array, your subsequent code may suffer, so you need to take care of that too.
If I were you, however, I would have constructed the DataFrame this way
df = pd.DataFrame(data)
df['label'] = lables
Please consider.

Predicting classes in MNIST dataset with a Gaussian- the same prediction errors with different paramemters?

I am trying to find the best c parameter following the instructions to a task that asks me to ' Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9, where
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
print("Downloading %s" % filename)
urlretrieve(source + filename, filename)
# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
data = data.reshape(-1,784)
return data
def load_mnist_labels(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different C-values. they each give me the same error?
def fit_generative_model(x,y):
lst=[]
for c in [20,200, 4000]:
k = 10 # labels 0,1,...,k-1
d = (x.shape)[1] # number of features
mu = np.zeros((k,d))
sigma = np.zeros((k,d,d))
pi = np.zeros(k)
for label in range(0,k):
indices = (y == label)
mu[label] = np.mean(x[indices,:], axis=0)
sigma[label] = np.cov(x[indices,:], rowvar=0, bias=1) + c*np.identity(784) # I define the identity matrix
predictions = np.argmax(score, axis=1)
errors = np.sum(predictions != y)
lst.append(errors)
print(c,"Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What is it I'm doing wrong? the correct answer is c=4000 which yields an error of ~4.3%.

How to map a function to a pre-batched ('BatchDataset') tensor in Tensorflow

I am making data windows (input, output pairs of windows) from time series data. I have already converted my time series to a tf dataset, where each batch has the number of time steps equal to the total window sizes I need.
def make_dataset(data=train_df[0]):
ds = tf.keras.preprocessing.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=32
)
return ds
Example of the shape returned:
for example in tensor.take(1):
print(f'shape: {example.shape}')
shape: (32, 48, 18)
What I need to do now is split the time dimension into my input-output pairs, and I have a function to do this, however, when I try to map this function to my 'ds' in the above function I get the following error:
'BatchDataset' object is not subscriptable
I am hoping someone can help me understand where I am going wrong? I am pretty new to tensorflow... My code is below, in this example 'input slice' and 'label_slice' are 0 and 24 respectively. So my aim is to split my batches into input-output pairs of length 24 each.
def split_window(features):
inputs = features[:, input_slice, :]
labels = features[:, labels_slice, :]
inputs.set_shape([None, input_width, None])
labels.set_shape([None, label_width, None])
return inputs, labels
def make_dataset(data=train_df[0]):
data = np.array(data, dtype=np.float32)
ds = tf.keras.preprocessing.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=32
)
ds = ds.map(split_window(ds))
return ds
tensor = make_dataset()
tensor
'BatchDataset' object is not subscriptable
Your snippet of code looks similar to the tutorial of Time Series in Tensorflow. Based on that, I modified the main class WindowGenerator() (excluded the parts of train/val/test datasets and output-labels selection) to a simpler class suitable to your question.
class WindowGenerator():
def __init__(self, input_width, label_width, shift):
self.input_width = input_width
self.label_width = label_width
self.shift = shift
self.total_window_size = input_width + shift
self.input_slice = slice(0, input_width)
self.input_indices = np.arange(self.total_window_size[self.input_slice]
self.label_start = self.total_window_size - self.label_width
self.labels_slice = slice(self.label_start, None)
self.label_indices = np.arange(self.total_window_size [self.labels_slice]
def split_window(self, features):
inputs = features[:, self.input_slice, :]
labels = features[:, self.labels_slice, :]
inputs.set_shape([None, self.input_width, None])
labels.set_shape([None, self.label_width, None])
return inputs, labels
def make_dataset(self, data):
data = np.array(data, dtype=np.float32)
ds = tf.keras.utils.timeseries_dataset_from_array(
data=data,
targets=None,
sequence_length=self.total_window_size,
sequence_stride=1,
shuffle=True,
batch_size=batch_size,)
ds = ds.map(self.split_window)
return ds
input_width=24
label_width=24
total_windth = input_width + label_width
batch_size = 32
window = WindowGenerator(input_width=input_width, label_width=label_width, shift=1)
dataset = window.make_dataset(train_df[0])
I would recommend, though, to use Dataset.window(). It is simpler and more intuitive.
dataset = tf.data.Dataset.from_tensor_slices(train_df[0])
dataset = dataset.window(total_windth, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(batch_size))
dataset = dataset.map(lambda window: (window[:-label_width], window[-input_width:]))

Padding sequences in tensorflow with tf.pad

I am trying to load imdb dataset in python. I want to pad the sequences so that each sequence is of same length. I am currently doing it with numpy. What is a good way to do it in tensorflow with tf.pad. I saw the given here but I dont know how to apply it with a 2 d matrix.
Here is my current code
import tensorflow as tf
from keras.datasets import imdb
max_features = 5000
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
def padSequence(dataset,max_length):
dataset_p = []
for x in dataset:
if(len(x) <=max_length):
dataset_p.append(np.pad(x,pad_width=(0,max_length-len(x)),mode='constant',constant_values=0))
else:
dataset_p.append(x[0:max_length])
return np.array(x_train_p)
max_length = max(len(x) for x in x_train)
x_train_p = padSequence(x_train,max_length)
x_test_p = padSequence(x_test,max_length)
print("input x shape: " ,x_train_p.shape)
Can someone please help ?
I am using tensorflow 1.0
In Response to the comment:
The padding dimensions are given by
# 'paddings' is [[1, 1,], [2, 2]].
I have a 2 d matrix where every row is of different length. I want to be able to pad to to make them of equal length. In my padSequence(dataset,max_length) function, I get the length of every row with len(x) function. Should I just do the same with tf ? Or is there a way to do it like Keras Function
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
If you want to use tf.pad, according to me you have to iterate for each row.
Code will be something like this:
max_length = 250
number_of_samples = 5
padded_data = np.ndarray(shape=[number_of_samples, max_length],dtype=np.int32)
sess = tf.InteractiveSession()
for i in range(number_of_samples):
reviewToBePadded = dataSet[i] #dataSet numpy array
paddings = [[0,0], [0, maxLength-len(reviewToBePadded)]]
data_tf = tf.convert_to_tensor(reviewToBePadded,tf.int32)
data_tf = tf.reshape(data_tf,[1,len(reviewToBePadded)])
data_tf = tf.pad(data_tf, paddings, 'CONSTANT')
padded_data[i] = data_tf.eval()
print(padded_data)
sess.close()
New to Python, possibly not the best code. But I just want to explain the concept.

Resources