Identifying the contents of training and testing dataset after using train_test_split() - scikit-learn

I am doing multi-class classification using ML. After preprocessing the data, I am using train_test_split function to divide the data into training and testing dataset. Is there a way to know how many samples from each class are present in the training and testing dataset? For example:
Class
No. of Training Samples
No. of Testing Samples
a
30
5
b
20
10
c
25
5
My Code:
classes = ['a','b','c']
def pp():
data_list=[]
for index,label in enumerate(classes):
class_list=[]
if label=='silence':
silence_path = os.path.join(C["dire"],'silence')
if not os.path.exists(silence_path):
os.mkdir(silence_path)
silence_stride = 2000
#sample_rate = 16000
folder = os.path.join(C["dire"],'_background_noise_')
for file_ in os.listdir(folder):
if '.wav' in file_:
load_path = os.path.join(folder,file_)
sample_rate,y = wavfile.read(load_path)
for i in range(0,len(y)-sample_rate,silence_stride):
file_path = "silence/{}_{}.wav".format(file_[:-4],i)
y_slice = y[i:i+sample_rate]
wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
class_list.append(file_path)
else:
folder = os.path.join(C["dire"],label)
for file_ in os.listdir(folder):
file_path = '{}/{}'.format(label,file_)
class_list.append(file_path)
random.shuffle(class_list)
data_list.append(class_list)
X = []
Y = []
preemphasis = 0.985
print("Feature Extraction Started")
for i,class_list in enumerate(data_list):
for j,samples in enumerate(class_list):
if(samples.endswith('.wav')):
sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
if(audio.size<sample_rate):
audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
coeff = mfccforconfidence.mfcc(audio,sample_rate,preemphasis)
X.append(coeff)
#print(X)
if(samples.split('/')[0] in classes):
Y.append(samples.split('/')[0])
elif(samples.split('/')[0]=='_background_noise_'):
Y.append('silence')
A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
for i in range(0,len(X)):
A[i] = np.array(X[i]) #Converting list X into array A
# print(A.shape)
end1 = time.time()
print("Time taken for feature extraction:{}sec".format(end1-start))
MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
MLB.classes_ #Same like classes array
print(Y_MLB.shape)
Y = Y_MLB
X = tf.keras.utils.normalize(X)
X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)
print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)
So, basically I am using ML for audio classification. After extracting the features, I divide the data into training and testing dataset.
I hope that this piece of code will be useful to answer the question.

If you have a "3D numpy array", here's a demonstration of one way you could do it.
import numpy as np
from random import randint,choices
# Create some data
my_data = np.array(list(zip(
(randint(0,100) for _ in range(100)),
(choices(["a","b","c"], k=100)),
(randint(0,100) for _ in range(100))
))
)
# Show the first 5 elements
print(my_data[0:5,:])
# [['69' 'a' '38']
# ['18' 'c' '73']
# ['57' 'a' '50']
# ['35' 'a' '60']
# ['52' 'b' '1']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(my_data[:,[0,1]], my_data[:,2])
from collections import Counter
print(Counter(X_train[:,1]))
# Counter({'c': 31, 'b': 26, 'a': 18})
print(Counter(X_train[:,1])["a"])
# 18
print(Counter(X_test[:,1]))
# Counter({'b': 12, 'c': 7, 'a': 6})

Related

using a list as a value in pandas.DataFeame and tensorFlow

Im tring to use list as a value in pandas.DataFrame
but Im getting Exception when trying to use use the adapt function in on the Normalization layer with the NumPy array
this is the error:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
and this is the code:
import pandas as pd
import numpy as np
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow.keras import layers
data = [[45.975, 45.81, 45.715, 45.52, 45.62, 45.65, 4],
[55.67, 55.975, 55.97, 56.27, 56.23, 56.275, 5],
[86.87, 86.925, 86.85, 85.78, 86.165, 86.165, 3],
[64.3, 64.27, 64.285, 64.29, 64.325, 64.245, 6],
[35.655, 35.735, 35.66, 35.69, 35.665, 35.63, 5]
]
lables = [0, 1, 0, 1, 1]
def do():
d_1 = None
for l, d in zip(lables, data):
if d_1 is None:
d_1 = pd.DataFrame({'lable': l, 'close_price': [d]})
else:
d_1 = d_1.append({'lable': l, 'close_price': d}, ignore_index=True)
dataset = d_1.copy()
print(dataset.isna().sum())
dataset = dataset.dropna()
print(dataset.keys())
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print(train_dataset.describe().transpose())
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('lable')
test_labels = test_features.pop('lable')
print(train_dataset.describe().transpose()[['mean', 'std']])
normalizer = tf.keras.layers.Normalization(axis=-1)
ar = np.array(train_features)
normalizer.adapt(ar)
print(normalizer.mean.numpy())
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
print('First example:', first)
print()
print('Normalized:', normalizer(first).numpy())
diraction = np.array(train_features)
diraction_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
diraction_normalizer.adapt(diraction)
diraction_model = tf.keras.Sequential([
diraction_normalizer,
layers.Dense(units=1)
])
print(diraction_model.summary())
print(diraction_model.predict(diraction[:10]))
diraction_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1),
loss='mean_absolute_error')
print(train_features['close_price'])
history = diraction_model.fit(
train_features['close_price'],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2)
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())
test_results = {}
test_results['diraction_model'] = diraction_model.evaluate(
test_features,
test_labels, verbose=0)
x = tf.linspace(0.0, 250, 251)
y = diraction_model.predict(x)
print("end")
def main():
do()
if __name__ == "__main__":
main()
I think it is not the usual practice to shrink your features into one column.
Quick-fix is you may put the following line
train_features = np.array(train_features['close_price'].to_list())
before
normalizer = tf.keras.layers.Normalization(axis=-1)
to get rid of the error, but now because your train_features has changed from a DataFrame into a np.array, your subsequent code may suffer, so you need to take care of that too.
If I were you, however, I would have constructed the DataFrame this way
df = pd.DataFrame(data)
df['label'] = lables
Please consider.

Predicting classes in MNIST dataset with a Gaussian- the same prediction errors with different paramemters?

I am trying to find the best c parameter following the instructions to a task that asks me to ' Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9, where
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
print("Downloading %s" % filename)
urlretrieve(source + filename, filename)
# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
data = data.reshape(-1,784)
return data
def load_mnist_labels(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different C-values. they each give me the same error?
def fit_generative_model(x,y):
lst=[]
for c in [20,200, 4000]:
k = 10 # labels 0,1,...,k-1
d = (x.shape)[1] # number of features
mu = np.zeros((k,d))
sigma = np.zeros((k,d,d))
pi = np.zeros(k)
for label in range(0,k):
indices = (y == label)
mu[label] = np.mean(x[indices,:], axis=0)
sigma[label] = np.cov(x[indices,:], rowvar=0, bias=1) + c*np.identity(784) # I define the identity matrix
predictions = np.argmax(score, axis=1)
errors = np.sum(predictions != y)
lst.append(errors)
print(c,"Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What is it I'm doing wrong? the correct answer is c=4000 which yields an error of ~4.3%.

Argument must be a string or a number issue, Not 'Type' - Pyspark

Update:
So i have been looking into the issue, the problem is with scikit-multiflow datastream. in last quarter of code stream_clf.partial_fit(X,y, classes=stream.target_values) here the class valuefor stream.target_values should a number or string, but the method is returning (dtype). When i print or loop stream.target_values i get this:
I have tried to do conversion etc. but still of no use. can someone please help here ?
Initial Problem
I am running a code (took inspiration from here). It works perfectly alright when used vanilla python environment.
But if i run this code after certain modification in Apache Spark using Pyspark , i get the following error
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
I have tried every possibile way to trace the issue but everything looks alright. The error arises from the last line of the code where hoefding tree is called for prediction. It expects an ndarray and the type of X variable is also ndarray. I am not sure what is trigerring the issue. Can some one please help or direct me to right trace?
complete stack of error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-52-1310132c88db> in <module>
30 D3_win.addInstance(X,y)
31 xx = np.array(X,dtype='float64')
---> 32 y_hat = stream_clf.predict(xx)
33
34
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict(self, X)
1068 r, _ = get_dimensions(X)
1069 predictions = []
-> 1070 y_proba = self.predict_proba(X)
1071 for i in range(r):
1072 index = np.argmax(y_proba[i])
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict_proba(self, X)
1099 votes = normalize_values_in_dict(votes, inplace=False)
1100 if self.classes is not None:
-> 1101 y_proba = np.zeros(int(max(self.classes)) + 1)
1102 else:
1103 y_proba = np.zeros(int(max(votes.keys())) + 1)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
Code
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import functions as fn
import sys
from pyspark import SparkContext,SparkConf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.data_stream import DataStream
import time
def drift_detector(S,T,threshold = 0.75):
T = pd.DataFrame(T)
#print(T)
S = pd.DataFrame(S)
# Give slack variable in_target which is 1 for old and 0 for new
T['in_target'] = 0 # in target set
S['in_target'] = 1 # in source set
# Combine source and target with new slack variable
ST = pd.concat( [T, S], ignore_index=True, axis=0)
labels = ST['in_target'].values
ST = ST.drop('in_target', axis=1).values
# You can use any classifier for this step. We advise it to be a simple one as we want to see whether source
# and target differ not to classify them.
clf = LogisticRegression(solver='liblinear')
predictions = np.zeros(labels.shape)
# Divide ST into two equal chunks
# Train LR on a chunk and classify the other chunk
# Calculate AUC for original labels (in_target) and predicted ones
skf = StratifiedKFold(n_splits=2, shuffle=True)
for train_idx, test_idx in skf.split(ST, labels):
X_train, X_test = ST[train_idx], ST[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)[:, 1]
predictions[test_idx] = probs
auc_score = AUC(labels, predictions)
print(auc_score)
# Signal drift if AUC is larger than the threshold
if auc_score > threshold:
return True
else:
return False
class D3():
def __init__(self, w, rho, dim, auc):
self.size = int(w*(1+rho))
self.win_data = np.zeros((self.size,dim))
self.win_label = np.zeros(self.size)
self.w = w
self.rho = rho
self.dim = dim
self.auc = auc
self.drift_count = 0
self.window_index = 0
def addInstance(self,X,y):
if(self.isEmpty()):
self.win_data[self.window_index] = X
self.win_label[self.window_index] = y
self.window_index = self.window_index + 1
else:
print("Error: Buffer is full!")
def isEmpty(self):
return self.window_index < self.size
def driftCheck(self):
if drift_detector(self.win_data[:self.w], self.win_data[self.w:self.size], auc): #returns true if drift is detected
self.window_index = int(self.w * self.rho)
self.win_data = np.roll(self.win_data, -1*self.w, axis=0)
self.win_label = np.roll(self.win_label, -1*self.w, axis=0)
self.drift_count = self.drift_count + 1
return True
else:
self.window_index = self.w
self.win_data = np.roll(self.win_data, -1*(int(self.w*self.rho)), axis=0)
self.win_label =np.roll(self.win_label, -1*(int(self.w*self.rho)), axis=0)
return False
def getCurrentData(self):
return self.win_data[:self.window_index]
def getCurrentLabels(self):
return self.win_label[:self.window_index]
def select_data(x):
x = "/user/hadoop1/tellus/sea_1.csv"
peopleDF = spark.read.csv(x, header= True)
df = peopleDF.toPandas()
scaler = MinMaxScaler()
df.iloc[:,0:df.shape[1]-1] = scaler.fit_transform(df.iloc[:,0:df.shape[1]-1])
return df
def check_true(y,y_hat):
if(y==y_hat):
return 1
else:
return 0
df = select_data("/user/hadoop1/tellus/sea_1.csv")
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(2000)
rho = float(0.4)
auc = float(0.60)
# In[ ]:
D3_win = D3(w,rho,stream.n_features,auc)
stream_acc = []
stream_record = []
stream_true= 0
i=0
start = time.time()
X,y = stream.next_sample(int(w*rho))
stream_clf.partial_fit(X,y, classes=stream.target_values)
while(stream.has_more_samples()):
X,y = stream.next_sample()
if D3_win.isEmpty():
D3_win.addInstance(X,y)
y_hat = stream_clf.predict(X)
Problem was with select_data() function, data type of variables was being changed during the execution. This issue is fixed now.

BERT binary Textclassification get different results every run

I do binary text classification with BERT from the Simpletransformer.
I work in Colab with GPU runtime type.
I have generated train and test set with the sklearn StratifiedKFold Method. I have two files with the dictionaries containing my folds.
I run my classification in the following while loop:
from sklearn.metrics import matthews_corrcoef, f1_score
import sklearn
counter = 0
resultatos = []
while counter != len(trainfolds):
model = ClassificationModel('bert', 'bert-base-multilingual-cased',args={'num_train_epochs': 4, 'learning_rate': 1e-5, 'fp16': False,
'max_seq_length': 160, 'train_batch_size': 24,'eval_batch_size': 24 ,
'warmup_ratio': 0.0,'weight_decay': 0.00,
'overwrite_output_dir': True})
print("start with fold_{}".format(counter))
trainfolds["{}_fold".format(counter)].to_csv("/content/data/train.tsv", sep="\t", index = False, header=False)
print("{}_fold Train als train.tsv exportiert". format(counter))
testfolds["{}_fold".format(counter)].to_csv("/content/data/dev.tsv", sep="\t", index = False, header=False)
print("{}_fold test als train.tsv exportiert". format(counter))
train_df = pd.read_csv("/content/data/train.tsv", delimiter='\t', header=None)
eval_df = df = pd.read_csv("/content/data/dev.tsv", delimiter='\t', header=None)
train_df = pd.DataFrame({
'text': train_df[3].replace(r'\n', ' ', regex=True),
'label':train_df[1]})
eval_df = pd.DataFrame({
'text': eval_df[3].replace(r'\n', ' ', regex=True),
'label':eval_df[1]})
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df, f1 = sklearn.metrics.f1_score)
print(result)
resultatos.append(result)
shutil.rmtree("outputs")
shutil.rmtree("cache_dir")
#shutil.rmtree("runs")
counter += 1
And i get different Results Running this code for the same Folds:
Here for example the F1 Scores for two runs:
0.6237942122186495
0.6189111747851003
0.6172839506172839
0.632183908045977
0.6182965299684542
0.5942492012779553
0.6025641025641025
0.6153846153846154
0.6390532544378699
0.6627906976744187
The F1 Score is: 0.6224511646974427
0.6064516129032258
0.6282420749279539
0.6402439024390244
0.5971014492753622
0.6135693215339232
0.6191950464396285
0.6382978723404256
0.6388059701492537
0.6097560975609756
0.5956112852664576
The F1 Score is: 0.618727463283623
How can they be that diffeerent for the same folds?
What i tried already is give a fixed Random seed right before my loop starts:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
I came up with approach of having the Model initialized in the loop because, when its outside the loop, it somehow remembers what it has learned - that means after the 2nd fold I get f1 score of almost one - despite the fact that i delete the cache..
I figured it out myself, just set all seeds plus torch.backends.cudnn.deterministic = True
and
torch.backends.cudnn.benchmark = False
like shown in this post and i get the same results for all runs!

Pad data using tf.data.Dataset

I have to use tf.data.Dataset for creating a input pipeline for an RNN model in tensorflow. I am providing a basic code, by which I need to pad the data in batch with a pad token and use it for further manipulation.
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen,(tf.int32,tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
print(sess.run(iter2.get_next()))
I hope the code is self explanatory with comments. But I am getting following error,
InvalidArgumentError (see above for traceback): Cannot batch tensors with different shapes in component 0. First element had shape [11] and element 1 had shape [12].
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?]], output_types=[DT_INT32, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]
I believe that since your generator yields two outputs, your padded_shapes and padded_values tuples must have a length of two. For me, this works:
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32))
dataset = dataset.batch(batch_size=25)
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None])) # sentence of unknown size
padding_values = (tf.constant(-111), tf.constant(-111)) # the value with which pad index needs to be filled
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=padding_values)
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
Finally got the answer. The issue was for the second padded shapes instead of Tensorshape([None]), we should provide [], because the second item returned by the generator is a scalar. If using Tensorshape([None]),, make sure we are returning a vector
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
total_data_size = 10000
embedding_dimension = 25
max_len = 17
varying_length = np.random.randint(max_len, size=(10000)) # varying length data
X = np.array([np.random.randint(1000, size=(value)).tolist()for index, value in enumerate(varying_length)]) # data of arying length
Y = np.random.randint(2, size=(total_data_size)).astype(np.int32) # target binary
embedding = np.random.uniform(-1,1,(1000, embedding_dimension)) # word embedding
def gen():
for index in range(len(X)):
yield X[index] , Y[index]
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32), (tf.TensorShape([None]), []))
padded_shapes = (tf.TensorShape([None]), []) # sentence of unknown size
dataset = (dataset
.padded_batch(25, padded_shapes=padded_shapes, padding_values=(-111, 0))
)
iter2 = dataset.make_initializable_iterator()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(iter2.initializer)
sess.run(iter2.get_next())

Resources