BERT - The truth value of a DataFrame is ambiguous - nlp

I am getting into deep learning for some of my models and I am running into issues. I wanted to get it to work simply without any adjustments in the data, but I got
Graph execution error:
followed by a bunch of lines like
File "C:\Users\rober\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
I also tried oversampling in order to get this to work but still no luck. Instead I get
The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Anyways here is my code for this part of my project:
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics import classification_report
import pandas as pd
import time
from sklearn.model_selection import train_test_split
tart = time.perf_counter()
df = pd.read_excel(r'F:\Documents\Graduate Research\RMGmail2YrsClassified.xlsx')
pd.set_option('display.max_columns', None)
print("Lets start by looking at the top of the dataframe.")
print(df.head(10))
print(df.groupby('Classification').describe())
print(type(df['Classification']))
#independent
#join into 1 column to do analysis
df['Text']= df['Subject'].astype(str)+ ' ' +df['Body'].astype(str)
#1D array required for vectorizer for DF1 (not in the scope of this)
X = df['Text']
df = df.drop(['Subject','Body','Email Frequency', 'Recieved_Time','Account','Sender_Email'], axis=1)
#DF 2 for Deep Learning
df2 =df
#classification needs to be int
df2['Classification']=df2['Classification'].astype(int)
X1 = df2['Text']
y1 = df2['Classification']
print("df2 is:")
print(df2)
df_Primary = df2[df2['Classification']==1]
print(df_Primary.shape)
df_Secondary = df2[df2['Classification']==2]
print(df_Secondary.shape)
df_Social = df2[df2['Classification']==3]
print(df_Social.shape)
df_Promotional = df2[df2['Classification']==4]
print(df_Promotional.shape)
#df_Primary = df2[df2['Classification']==1]
#print(df_Primary.shape)
df_Primary_oversampled = df_Primary.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_Secondary_oversampled = df_Secondary.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_Social_oversampled = df_Social.sample(df_Promotional, replace=True)
print(df_Primary_oversampled.shape)
df_balanced = pd.concat([df_Primary_oversampled, df_Secondary_oversampled,
df_Social_oversampled,df_Promotional])
print(df_balanced.shape)
#change to preprocessed when removing stop words
X2 = df_balanced['Text']
y2 = df_balanced['Category']
X_trainDLT, X_testDLT, y_trainDLT, y_testDLT = train_test_split(
X2, y2, test_size=0.3,stratify=df['Classification'], random_state=50)
X_trainDLV, X_testDLV, y_trainDLV, y_testDLV = train_test_split(
X1, y1, test_size=0.3,stratify=df['Classification'], random_state=50)
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
# Use inputs and outputs to construct a final model
BERT = tf.keras.Model(inputs=[text_input], outputs = [l])
print(BERT.summary())
print(len(X_trainDLT))
METRICS = [
tf.keras.metrics.BinaryAccuracy(name='accuracy'),
tf.keras.metrics.Precision(name='precision'),
tf.keras.metrics.Recall(name='recall')
]
BERT.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=METRICS)
BERT.fit(X_trainDLT, y_trainDLT, epochs=10)
y_predBERT = BERT.predict(X_testDLV)
y_predBERT = y_predBERT.flatten()
print("Bert")
print(classification_report(y_testDLV, y_predBERT))
I also read I shouldn't use oversampled or synthetically created samples in testing thus the 2 train test splits. In addition for my Y value "Classification" is represented in 1-4 for respective mail categories (primary, secondary, social, promotional)

Related

using a list as a value in pandas.DataFeame and tensorFlow

Im tring to use list as a value in pandas.DataFrame
but Im getting Exception when trying to use use the adapt function in on the Normalization layer with the NumPy array
this is the error:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
and this is the code:
import pandas as pd
import numpy as np
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow.keras import layers
data = [[45.975, 45.81, 45.715, 45.52, 45.62, 45.65, 4],
[55.67, 55.975, 55.97, 56.27, 56.23, 56.275, 5],
[86.87, 86.925, 86.85, 85.78, 86.165, 86.165, 3],
[64.3, 64.27, 64.285, 64.29, 64.325, 64.245, 6],
[35.655, 35.735, 35.66, 35.69, 35.665, 35.63, 5]
]
lables = [0, 1, 0, 1, 1]
def do():
d_1 = None
for l, d in zip(lables, data):
if d_1 is None:
d_1 = pd.DataFrame({'lable': l, 'close_price': [d]})
else:
d_1 = d_1.append({'lable': l, 'close_price': d}, ignore_index=True)
dataset = d_1.copy()
print(dataset.isna().sum())
dataset = dataset.dropna()
print(dataset.keys())
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print(train_dataset.describe().transpose())
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('lable')
test_labels = test_features.pop('lable')
print(train_dataset.describe().transpose()[['mean', 'std']])
normalizer = tf.keras.layers.Normalization(axis=-1)
ar = np.array(train_features)
normalizer.adapt(ar)
print(normalizer.mean.numpy())
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
print('First example:', first)
print()
print('Normalized:', normalizer(first).numpy())
diraction = np.array(train_features)
diraction_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
diraction_normalizer.adapt(diraction)
diraction_model = tf.keras.Sequential([
diraction_normalizer,
layers.Dense(units=1)
])
print(diraction_model.summary())
print(diraction_model.predict(diraction[:10]))
diraction_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1),
loss='mean_absolute_error')
print(train_features['close_price'])
history = diraction_model.fit(
train_features['close_price'],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2)
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())
test_results = {}
test_results['diraction_model'] = diraction_model.evaluate(
test_features,
test_labels, verbose=0)
x = tf.linspace(0.0, 250, 251)
y = diraction_model.predict(x)
print("end")
def main():
do()
if __name__ == "__main__":
main()
I think it is not the usual practice to shrink your features into one column.
Quick-fix is you may put the following line
train_features = np.array(train_features['close_price'].to_list())
before
normalizer = tf.keras.layers.Normalization(axis=-1)
to get rid of the error, but now because your train_features has changed from a DataFrame into a np.array, your subsequent code may suffer, so you need to take care of that too.
If I were you, however, I would have constructed the DataFrame this way
df = pd.DataFrame(data)
df['label'] = lables
Please consider.

Predicting classes in MNIST dataset with a Gaussian- the same prediction errors with different paramemters?

I am trying to find the best c parameter following the instructions to a task that asks me to ' Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9, where
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
print("Downloading %s" % filename)
urlretrieve(source + filename, filename)
# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
data = data.reshape(-1,784)
return data
def load_mnist_labels(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different C-values. they each give me the same error?
def fit_generative_model(x,y):
lst=[]
for c in [20,200, 4000]:
k = 10 # labels 0,1,...,k-1
d = (x.shape)[1] # number of features
mu = np.zeros((k,d))
sigma = np.zeros((k,d,d))
pi = np.zeros(k)
for label in range(0,k):
indices = (y == label)
mu[label] = np.mean(x[indices,:], axis=0)
sigma[label] = np.cov(x[indices,:], rowvar=0, bias=1) + c*np.identity(784) # I define the identity matrix
predictions = np.argmax(score, axis=1)
errors = np.sum(predictions != y)
lst.append(errors)
print(c,"Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What is it I'm doing wrong? the correct answer is c=4000 which yields an error of ~4.3%.

Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined,

I am trying to define func(x) in order to use the genetic algs library here:
https://github.com/bobirdmi/genetic-algorithms/tree/master/examples
However, when I try and use sga.init_random_population(population_size, params, interval) the code complains of me using tf.Tensors as python bools.
However, I am only referencing one bool in the entire code (Elitism) so I have no idea why this error is even showing. Asked around others who used sga.init_... and my inputs/setup is fine. Any suggestions would be greatly appreciated.
Full traceback:
Traceback (most recent call last):
File "C:\Users\Eric\eclipse-workspace\hw1\ga2.py", line 74, in <module>
sga.init_random_population(population_size, params, interval)
File "C:\Program Files\Python36\lib\site-packages\geneticalgs\real_ga.py", line 346, in init_random_population
self._sort_population()
File "C:\Program Files\Python36\lib\site-packages\geneticalgs\standard_ga.py", line 386, in _sort_population
self.population.sort(key=lambda x: x.fitness_val, reverse=True)
File "C:\Program Files\Python36\lib\site-packages\tensorflow\python\framework\ops.py", line 671, in __bool__
raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. "
TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
code
import hw1
#import matplotlib
from geneticalgs import BinaryGA, RealGA, DiffusionGA, MigrationGA
#import numpy as np
#import csv
#import time
#import pickle
#import math
#import matplotlib.pyplot as plt
from keras.optimizers import Adam
from hw1 import x_train, y_train, x_test, y_test
from keras.losses import mean_squared_error
#import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
# GA standard settings
generation_num = 50
population_size = 16
elitism = True
selection = 'rank'
tournament_size = None # in case of tournament selection
mut_type = 1
mut_prob = 0.05
cross_type = 1
cross_prob = 0.95
optim = 'min' # minimize or maximize a fitness value? May be 'min' or 'max'.
interval = (-1, 1)
# Migration GA settings
period = 5
migrant_num = 3
cloning = True
def func(x):
#dimensions of weights and biases
#layer0weights = [10][23]
#layer0biases = [10]
#layer1weights = [10][20]
#layer1biases = [20]
#layer2weights = [1][20]
#layer2biases = [1]
#split up x for weights and biases
lay0 = x[0:230]
bias0 = x[230:240]
lay1 = x[240:440]
bias1 = x[440:460]
lay2 = x[460:480]
bias2 = x[480:481]
#fit to the shape of the actual model
lay0 = lay0.reshape(23,10)
bias0 = bias0.reshape(10,)
lay1 = lay1.reshape(10,20)
bias1 = bias1.reshape(20,)
lay2 = lay2.reshape(20,1)
bias2 = bias2.reshape(1,)
#set the newly shaped object to layers
hw1.model.layers[0].set_weights([lay0, bias0])
hw1.model.layers[1].set_weights([lay1, bias1])
hw1.model.layers[2].set_weights([lay2, bias2])
res = hw1.model.predict(x_train)
error = mean_squared_error(res,y_train)
return error
ga_model = Sequential()
ga_model.add(Dense(10, input_dim=23, activation='relu'))
ga_model.add(Dense(20, activation='relu'))
ga_model.add(Dense(1, activation='sigmoid'))
sga = RealGA(func, optim=optim, elitism=elitism, selection=selection,
mut_type=mut_type, mut_prob=mut_prob,
cross_type=cross_type, cross_prob=cross_prob)
params = 481
sga.init_random_population(population_size, params, interval)
optimal = sga.best_solution[0]
predict = func(optimal)
print(predict)
Tensorflow generates a computational graph of operations to be executed in an Tensorflow session.
geneticalgs.RealGA.init_random_population is an operation that uses the numpy.random.uniform to generate a numpy array. 1
The generated population being a Tensor object could mean maybe:
numpy.random.uniform invoked in geneticalgs.RealGA.init_random_population was decorated to return Tensors
numpy.random.uniform was added in the computation graph to be executed in a session.
I'll try executing the program eagerly by enabling eager execution. 2
tf.enable_execution()
You can also in a way execute the parts that you care about eagerly.
size = tf.placeholder(tf.int64)
dim = tf.placeholder(tf.int64)
interval = tf.placeholder(tf.int64, shape=(2,))
init_random_population = tf.py_func(
sga.init_random_population, [size, dim, interval], [])
with tf.Session() as session:
session.run(
init_random_population,
{size: population_size, dim: params, interval: interval})

Outputting coefficients when running linear regression using sklearn

I'm attempting to run a simple linear regression on a data set and retrieve the coefficients. The data which is from a a .csv file looks like:
"","time","LakeHuron"
"1",1875,580.38
"2",1876,581.86
"3",1877,580.97
"4",1878,580.8
...
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
def Main():
location = r"~/Documents/Time Series/LakeHuron.csv"
ts = pd.read_csv(location, sep=",", parse_dates=[0], header=None)
ts.drop(ts.columns[[0]], axis=1, inplace=True)
length = len(ts)
x = ts[1].values
y = ts[2].values
x = x.reshape(length, 1)
y = y.reshape(length, 1)
regr = linear_model.LinearRegression()
regr.fit(x, y)
print(regr.coef_)
if __name__ == "__main__":
Main()
Since this is a simple linear model then $Y_t = a_0 + a_1*t$, which in this case should be $Y_t = 580.202 -0.0242t$. and all that prints out when running the above code is [[-0.02420111]]. Is there anyway to get the second coefficient 580.202?
I've had a look at the documentation on http://scikit-learn.org/stable/modules/linear_model.html and it outputs two variables in the array.
Look like you only have one X and one Y, So output is correct.
Try this:
#coef_ : array, shape (n_features, ) or (n_targets, n_features)
print(regr.coef_)
#intercept_ : array Independent term in the linear model.
print(regr.intercept_)
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

Wrong intercept in Spark linear regression

I am starting with Spark Linear Regression. I am trying to fit a line to a linear dataset. It seems that the intercept is not correctly adjusting, or probably I am missing something..
With intercept=False:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=False)
This seems normal. But when I use intercept=True:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=True)
The model that I get in the last case is exactly:
(weights=[0.0353471289751], intercept=1.0005127185289888)
I have tried with different datasets, step sizes and iterations, but always the model converges the intercept is about 1
EDIT - This is the code I am using:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
sc = SparkContext("local", "regression")
# Generate data
SIZE = 300
SLOPE = 0.1
BASE = -30
NOISE = 10
x = np.arange(SIZE)
delta = np.random.uniform(-NOISE,NOISE, size=(SIZE,))
y = BASE + SLOPE*x + delta
data = zip(range(len(y)), y) # zip with index
dataRDD = sc.parallelize(data)
# Normalize data
# mean = np.mean(data)
# std = np.std(data)
# dataRDD = dataRDD.map(lambda r: (r[0], (float(r[1])-mean)/std))
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [float(r[0])]))
# Create linear model
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=1000, step=0.0002, intercept=True, convergenceTol=0.000001)
print linear_model
true_vs_predicted = labeledData.map(lambda p: (p.label, linear_model.predict(p.features))).collect()
# PLOT
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid()
y_real = [x[0] for x in true_vs_predicted]
y_pred = [x[1] for x in true_vs_predicted]
plt.plot(range(len(y_real)), y_real, 'o', markersize=5, c='b')
plt.plot(range(len(y_pred)), y_pred, 'o', markersize=5, c='r')
plt.show()
This is because the number of iterations and the step size are both smaller. As a result, The trial process is ending before reaching the local optima.

Resources