I have a dataset with Russian text, which looks like this:
I am trying to pre-process this dataset and split it to train,dev and testing datasets by using the following code:
# coding=utf-8
import os
import argparse
import xml.etree.ElementTree as ET
import random
import math
from collections import Counter
from utils import semeval2014term_to_aspectsentiment_hr
from copy import copy, deepcopy
parser = argparse.ArgumentParser(description='Generate finetuning corpus for restaurants.')
parser.add_argument('--noconfl',
action='store_true',
default=False,
help='Remove conflicting sentiments from labels')
parser.add_argument('--istrain',
action='store_true',
default=False,
help='If is a training set we split of 10% and output train_full, train_split, dev. Default is testset creating no split')
parser.add_argument("--files",
type=str,
nargs='+',
action="store",
help="File that contains the data used for training. Multiple paths will mix the datasets.")
parser.add_argument("--output_dir",
type=str,
action="store",
default="data/transformed/untitled",
help="output dir of the dataset(s)")
parser.add_argument("--upsample",
type=str,
action="store",
default=None,
help="please add a string with 3 numbers like '0.5 0.3 0.2' representing relative numbers of 'POS NEG NEU' adding to 1"
" which represents target distribution - only valid in non-confl case")
parser.add_argument("--seed",
type=int,
action="store",
default=41,
help="random seed, effects on upsampling and validationset")
args = parser.parse_args()
# 1. Load The Dataset
# 2. Create Bert-Pair Style Format
# 3. Save Train, Validation and so on
def split_shuffle_array(ratio, array, rseed):
# split_ratio_restaurant = .076 # for 150 sentence in conflicting case
# split_ratio_laptops = .101 # for 150 sentences in conflicting case
random.Random(rseed).shuffle(array)
m = math.floor(ratio * len(array))
return array[0:m], array[m::]
def create_sentence_pairs(sents, aspect_term_sentiments):
# create sentence_pairs
all_sentiments = []
sentence_pairs = []
labels = []
for ix, ats in enumerate(aspect_term_sentiments):
s = sents[ix]
for k, v in ats:
all_sentiments.append(v)
sentence_pairs.append((s, k))
labels.append(v)
counts = Counter(all_sentiments)
return sentence_pairs, labels, counts
def upsample_data(sentence_pairs, labels, target_ratios={'POS': 0.53, 'NEG': 0.21, 'NEU': 0.26}):
# one question: should we upsample sentencepairs, where the sentence only occurs once?!
print('Upsampling data ...')
# print(sentence_pairs, labels) # is list of pairs -> decide which pair to upsample ...
# 0. compute indeex subsets for every example
# 1. compute how many samples to sample ->
ix_subsets = {
'POS': [],
'NEG': [],
'NEU': []
}
ratios_subsets = {
'POS': 0,
'NEG': 0,
'NEU': 0
}
examples_to_add = {
'POS': 0,
'NEG': 0,
'NEU': 0
}
n = float(len(labels))
for ix, l in enumerate(labels):
ix_subsets[l].append(ix)
ratios_subsets[l] += (1.0 / n)
t_keys = target_ratios.keys()
tmp = [math.floor(target_ratios[k] * n) - len(ix_subsets[k]) for k in t_keys]
class_nothing_to_add = list(t_keys)[tmp.index(min(tmp))]
print(t_keys)
print(ratios_subsets)
print(tmp)
print(class_nothing_to_add)
# print(ix_subsets)
m = len(ix_subsets[class_nothing_to_add]) / target_ratios[class_nothing_to_add]
total_to_add = m - n
print(n, math.floor(m))
examples_to_add = {k: math.floor(target_ratios[k] * m - len(ix_subsets[k])) for k in t_keys}
print(examples_to_add) # so we need to add more neutral examples and more positiev ones
# downsampling would be set 0 the maximum amount of negative ones
# now select all the indices, with replacement because it can be more than double
new_samples = []
for k in t_keys:
new_samples.extend(random.Random(args.seed).choices(ix_subsets[k], k=examples_to_add[k]))
print(len(new_samples))
# now add all new samples to the dataset and shuffle it
new_sentence_pairs = copy(sentence_pairs)
new_labels = labels.copy()
for ix in new_samples:
new_sentence_pairs.append(copy(sentence_pairs[ix]))
new_labels.append(labels[ix])
random.Random(args.seed).shuffle(new_sentence_pairs)
random.Random(args.seed).shuffle(new_labels)
print(len(set(new_sentence_pairs)))
print(len(set(sentence_pairs)))
return new_sentence_pairs, new_labels
def export_dataset_to_xml(fn, sentence_pairs, labels):
# export in format semeval 2014, incomplete though! just for loading with existing dataloaders for ATSC
sentences_el = ET.Element('sentences')
sentimap_reverse = {
'POS': 'positive',
'NEU': 'neutral',
'NEG': 'negative',
'CONF': 'conflict'
}
for ix, (sentence, aspectterm) in enumerate(sentence_pairs):
# print(sentence)
sentiment = labels[ix]
sentence_el = ET.SubElement(sentences_el, 'sentence')
sentence_el.set('id', str(ix))
text = ET.SubElement(sentence_el, 'text')
text.text = str(sentence).strip()
aspect_terms_el = ET.SubElement(sentence_el, 'aspectTerms')
aspect_term_el = ET.SubElement(aspect_terms_el, 'aspectTerm')
aspect_term_el.set('term', aspectterm)
aspect_term_el.set('polarity', sentimap_reverse[sentiment])
aspect_term_el.set('from', str('0'))
aspect_term_el.set('to', str('0'))
def indent(elem, level=0):
i = "\n" + level * " "
j = "\n" + (level - 1) * " "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for subelem in elem:
indent(subelem, level + 1)
if not elem.tail or not elem.tail.strip():
elem.tail = j
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = j
return elem
indent(sentences_el)
# mydata = ET.dump(sentences_el)
mydata = ET.tostring(sentences_el)
with open(fn, "wb") as f:
# f.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
f.write(mydata)
f.close()
def save_dataset_to_tsv(fn, data):
pass
sentence_pairs_train_mixed = []
sentence_pairs_trainsplit_mixed = []
sentence_pairs_dev_mixed = []
sentence_pairs_test_mixed = []
labels_train_mixed = []
labels_trainsplit_mixed = []
labels_dev_mixed = []
labels_test_mixed = []
for fn in args.files:
print(args.output_dir)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
print(fn)
sents_train, ats_train, idx2labels = semeval2014term_to_aspectsentiment_hr(fn,
remove_conflicting=args.noconfl)
sentence_pairs_train, labels_train, counts_train = create_sentence_pairs(sents_train, ats_train)
if args.istrain:
sents_dev, sents_trainsplit = split_shuffle_array(.1, sents_train, 41)
ats_dev, ats_trainsplit = split_shuffle_array(.1, ats_train, 41)
sentence_pairs_dev, labels_dev, counts_dev = create_sentence_pairs(sents_dev, ats_dev)
sentence_pairs_trainsplit, labels_trainsplit, counts_trainsplit = create_sentence_pairs(sents_trainsplit,
ats_trainsplit)
print_dataset_stats('Train', sents_train, sentence_pairs_train, counts_train)
print_dataset_stats('Dev', sents_dev, sentence_pairs_dev, counts_dev)
print_dataset_stats('TrainSplit', sents_trainsplit, sentence_pairs_trainsplit, counts_trainsplit)
sentence_pairs_trainsplit_mixed += sentence_pairs_trainsplit
sentence_pairs_train_mixed += sentence_pairs_train
sentence_pairs_dev_mixed += sentence_pairs_dev
labels_trainsplit_mixed += labels_trainsplit
labels_train_mixed += labels_train
labels_dev_mixed += labels_dev
if len(args.files) == 1:
if args.upsample:
distro_arr = args.upsample.split(' ')
pos = float(distro_arr[0])
neg = float(distro_arr[1])
neu = float(distro_arr[2])
assert pos + neg + neu == 1.0, 'upsampling target distribution does not sum to 1'
target_distro = {'POS': pos, 'NEG': neg, 'NEU': neu}
print('Target Sampling Distribution for Training Set:', target_distro)
sentence_pairs_train, labels_train = upsample_data(sentence_pairs_train, labels_train, target_ratios=target_distro)
export_dataset_to_xml(args.output_dir + '/train.xml', sentence_pairs_train, labels_train)
export_dataset_to_xml(args.output_dir + '/dev.xml', sentence_pairs_dev, labels_dev)
export_dataset_to_xml(args.output_dir + '/train_split.xml', sentence_pairs_trainsplit, labels_trainsplit)
else:
sentence_pairs_test_mixed += sentence_pairs_train
labels_test_mixed += labels_train
print_dataset_stats('Test', sents_train, sentence_pairs_train, counts_train)
if len(args.files) == 1:
export_dataset_to_xml(args.output_dir + '/test.xml', sentence_pairs_train, labels_train)
if len(args.files) > 1:
if args.istrain:
export_dataset_to_xml(args.output_dir + '/train.xml', sentence_pairs_train_mixed, labels_train_mixed)
export_dataset_to_xml(args.output_dir + '/dev.xml', sentence_pairs_dev_mixed, labels_dev_mixed)
export_dataset_to_xml(args.output_dir + '/train_split.xml', sentence_pairs_trainsplit_mixed,
labels_trainsplit_mixed)
else:
export_dataset_to_xml(args.output_dir + '/test.xml', sentence_pairs_test_mixed, labels_test_mixed)
After running the code above I have this result:
For English text it works just fine. Could someone help me to fix this and get normal text?
ET.tostring(sentences_el, encoding='UTF-8')
Related
I'm using a custom batch generator with large dataframe. but the Generator takes too much time to generate a batch, it takes 127s to generate a batch of 1024. I've tried Dask but still, the processing is slow. is there any way to integrate multiprocessing with inside the generator. knowing that I've tried use_multiprocessing=True with workers=12
import keras
from random import randint
import glob
import warnings
import numpy as np
import math
import pandas as pd
import dask.dataframe as dd
class BatchGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, labels=None, batch_size=8, n_classes=4, shuffle=True,
seq_len=6, data_path=None, meta_path=None,list_IDs=None):
'Initialization'
self.batch_size = batch_size
self.labels = labels
self.n_classes = n_classes
self.shuffle = shuffle
self.seq_len = seq_len
self.meta_df = meta_path
self.data_df = data_path
self.data_df = self.data_df.astype({"mjd": int})
self.list_IDs = list_IDs
if self.list_IDs==None:
self.list_IDs = list(self.meta_df['object_id'].unique())
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(len(self.list_IDs) / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
X, y = self.__data_generation(list_IDs_temp)
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_IDs_temp):
X_dat = np.zeros((self.batch_size, self.seq_len,6,1))
Y_mask = np.zeros((self.batch_size, self.seq_len,6,1))
# Y_dat = np.empty((self.batch_size,1), dtype=int)
X_length= np.empty((self.batch_size,1), dtype=int)
for i, trans_id in enumerate(list_IDs_temp):
curve = self.data_df[self.data_df.object_id==trans_id]
mjdlist = list(curve['mjd'].unique())
ts_length = len(mjdlist)
if ts_length <= self.seq_len :
start_ind = 0
else :
start_ind = randint(0, ts_length - self.seq_len)
ts_length = self.seq_len
for j in range(ts_length):
if j+start_ind < len(mjdlist):
step = curve[curve.mjd==mjdlist[j+start_ind]]
for k in range(len(step.mjd)):
obs = step[step.passband==k]
if len(obs) == 0 :
# print('here is one')
continue
else:
if k == 0:
X_dat[i,j,0,0] =obs.flux.iloc[0]
Y_mask[i,j,0,0] = 1
if k == 1:
X_dat[i,j,1,0] = obs.flux.iloc[0]
Y_mask[i,j,1,0] = 1
if k == 2:
X_dat[i,j,2,0] = obs.flux.iloc[0]
Y_mask[i,j,2,0] = 1
if k == 3:
X_dat[i,j,3,0] = obs.flux.iloc[0]
Y_mask[i,j,3,0] = 1
if k == 4:
X_dat[i,j,4,0] = obs.flux.iloc[0]
Y_mask[i,j,4,0] = 1
if k == 5:
X_dat[i,j,5,0] = obs.flux.iloc[0]
Y_mask[i,j,5,0] = 1
# meta = self.meta_df[self.meta_df['object_id'] == trans_id]
# Y_dat[i] = self.labels[int(meta['target'])]
X_length[i,0] = ts_length
flux_max = np.max(X_dat[i])
flux_min = np.min(X_dat[i])
flux_pow = math.log2(flux_max - flux_min)
X_dat[i] /= flux_pow
X_noised = X_dat + np.random.uniform(low=0, high=0.5, size=X_dat.shape)
return [X_noised, X_length, np.reshape(Y_mask,(self.batch_size, self.seq_len*6))], np.reshape(X_dat,(self.batch_size, self.seq_len*6))
To make it faster, the for loop in the function __data_generation should be parallelized. Using the joblib package may help.
Here I have my Python code code, I don't understand why I am getting the following error. Any guidance or help would be much appreciated.
UnboundLocalError: local variable 'top_performer' referenced before assignment
def create(X, y, **kwargs):
method = kwargs.get("method", None)
#method = kwargs.get("method", "Binary_operators")
#method = kwargs.get("method", "Binning")
#method = kwargs.pop("method", "Cluster")
#categorical_cols = [c for c, t in zip(X.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT, DATATYPE_CATEGORY_STRING]]
#numerical_cols = [c for c, t in zip(X.columns, X_column_types) if t == DATATYPE_NUMBER]
#categorical = X[categorical_cols]
#numerical = X[numerical_cols]
categorical = X.select_dtypes(include=[object])
numerical = X.select_dtypes(exclude=[object])
# feature selection using Genetic Algorithm
if method == "fs_GA":
print("fs_GA")
enc = OneHotEncoder()
enc.fit(categorical)
Data_cat=pd.DataFrame(enc.transform(categorical).toarray())
X_data = pd.concat([numerical, Data_cat], axis=1)
if y.dtype == int:
y = y
else:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_data, y, train_size=0.8, random_state=42)
def get_fitness(individual):
if y.dtype == int:
rg = RandomForestRegressor(random_state=42)
else:
rg = RandomForestClassifier(random_state=42)
columns = [column for (column, binary_value) in zip(X_train.columns, individual) if binary_value]
training_set = X_train[columns]
test_set = X_test[columns]
rg.fit(training_set.values, y_train)
preds = rg.predict(test_set.values)
return 100 / np.sqrt(mean_squared_error(y_test, preds))
individual = [1] * 100
get_fitness(individual)
def get_population_fitness(population):
return sorted([(individual, get_fitness(individual)) for individual in population], key=lambda tup: tup[1], reverse=True)
def crossover(individual_a, individual_b):
crossing_point = random.randint(0, 99)
offspring_a = individual_a[0:crossing_point] + individual_b[crossing_point:100]
offspring_b = individual_b[0:crossing_point] + individual_a[crossing_point:100]
return offspring_a, offspring_b
def tournament(current_population):
index = sorted(random.sample(range(0, 20), 5))
tournament_members = [current_population[i] for i in index]
total_fitness = sum([individual[1] for individual in tournament_members])
probabilities = [individual[1] / total_fitness for individual in tournament_members]
index_a, index_b = np.random.choice(5, size=2, p=probabilities)
return crossover(tournament_members[index_a][0], tournament_members[index_b][0])
def mutation(individual):
mutation_point = random.randint(0, 99)
if(individual[mutation_point]):
individual[mutation_point] = 0
else:
individual[mutation_point] = 1
def build_next_generation(current_population, mutation_rate):
next_generation = []
next_generation.append(current_population[0][0]) # elitism
next_generation.append(current_population[random.randint(1,19)][0]) # randomness
for i in range(9): # tournaments
offspring_a, offspring_b = tournament(current_population)
next_generation.append(offspring_a)
next_generation.append(offspring_b)
for individual in next_generation: # mutation
if(random.randint(1,mutation_rate) == 1):
mutation(individual)
return next_generation
def run_ga(current_population, num_of_generations, mutation_rate=1000):
fittest_individuals = []
for i in range(num_of_generations):
current_population = get_population_fitness(current_population) # get pop fitness
fittest_individuals.append(current_population[0]) # record fittest individual (for graphing and analysis)
current_population = build_next_generation(current_population, mutation_rate) # make new population
return fittest_individuals
initial_population = [[random.randint(0, 1) for i in range(100)] for i in range(20)]
high_mutation_fittest = run_ga(initial_population, 100, mutation_rate=5)
high_mutation_fitness = [ind[1] for ind in high_mutation_fittest]
for item in high_mutation_fittest[:-1]:
if item[1] == max(high_mutation_fitness):
top_performer = item
break
print("Total features included: " + str(top_performer[0].count(1)))
selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
excluded_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if not binary_value]
X = X[selected_features]
if method == "Binary_operators":
print("binaryoperators")
if method == "Binning":
print("binning")
else:
print("Discretization")
if method == "Cluster":
print("clustering")
else:
print("no-cluster")
print("normal_autocross")
So when I run the code I get the following error and I don't seem to understand what it means. Can someone please explain to me why i'm getting this error?
create(X, y, method="fs_GA")
fs_GA
UnboundLocalError Traceback (most recent call last)
in
----> 1 create(X, y, method="fs_GA")
in create(X, y, **kwargs)
107 top_performer = item
108 break
--> 109 print("Total features included: " + str(top_performer[0].count(1)))
110
111 selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
UnboundLocalError: local variable 'top_performer' referenced before assignment
top_performer = 0
for item in high_mutation_fittest[:-1]:
if item[1] == max(high_mutation_fitness):
top_performer = item
break
print("Total features included: " + str(top_performer[0].count(1)))
According to your code top_performer is an int variable, not an array, str(top_performer) is correct way of using it. str(top_performer).count('1') , this could be what you are looking for. count is for string not int
Loading data into dataset using pytorch dataloader.
Getting error cannot import name 'read_data_sets'
Tried searaching for results from similar issues.
If there is confusion about file instead of module and it can't find read_data_sets in your file How do i change to fix?
class MRDataset(data.Dataset):
def __init__(self, root_dir, task, plane, train=True, transform=None, weights=None):
super().__init__()
self.task = task
self.plane = plane
self.root_dir = root_dir
self.train = train
if self.train:
self.folder_path = self.root_dir + 'train/{0}/'.format(plane)
self.records = pd.read_csv(
self.root_dir + 'train-{0}.csv'.format(task), header=None, names=['id', 'label'])
else:
transform = None
self.folder_path = self.root_dir + 'valid/{0}/'.format(plane)
self.records = pd.read_csv(
self.root_dir + 'valid-{0}.csv'.format(task), header=None, names=['id', 'label'])
self.records['id'] = self.records['id'].map(
lambda i: '0' * (4 - len(str(i))) + str(i))
self.paths = [self.folder_path + filename +
'.npy' for filename in self.records['id'].tolist()]
self.labels = self.records['label'].tolist()
self.transform = transform
if weights is None:
pos = np.sum(self.labels)
neg = len(self.labels) - pos
self.weights = torch.FloatTensor([1, neg / pos])
else:
self.weights = torch.FloatTensor(weights)
def __len__(self):
return len(self.paths)
def __getitem__(self, index):
array = np.load(self.paths[index])
label = self.labels[index]
if label == 1:
label = torch.FloatTensor([[0, 1]])
elif label == 0:
label = torch.FloatTensor([[1, 0]])
if self.transform:
array = self.transform(array)
else:
array = np.stack((array,)*3, axis=1)
array = torch.FloatTensor(array)
# if label.item() == 1:
# weight = np.array([self.weights[1]])
# weight = torch.FloatTensor(weight)
# else:
# weight = np.array([self.weights[0]])
# weight = torch.FloatTensor(weight)
return array, label, self.weights
There is a model and train class to run this. Arguments specified in train.
Running the train should load data and run through model
I am trying to modify a triangular arbitrage crypto trading bot to include a predictive capability with a neural network. I've found some open source algorithms on GitHub, but I am having problem integrating them.
I've been trying to separate parts of the code into modules and using a continuously updated csv file to direct the data from the first half of the algorithm into the second, but it just isn't working.
I tried to create modules for different parts of the algorithm, but it didn't work:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from binance.client import Client
from binance.enums import *
from sklearn.metrics import mean_squared_error
import time
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from pandas import Series
from matplotlib import cm
api_key = BinanceKey1['api_key']
api_secret = BinanceKey1['api_secret']
client = Client(api_key, api_secret)
import csv
import json
from binance.client import Client
import csv
import json
from binance.client import Client
client = Client(api_key, api_secret)
client = Client("", "")
klines1 = client.get_historical_klines("BNBBTC", Client.KLINE_INTERVAL_1MINUTE,
"1 day ago UTC")
csv.write(klines1)
# fetch 30 minute klines for the last month of 2017
klines2 = client.get_historical_klines("ETHBTC", Client.KLINE_INTERVAL_30MINUTE,
"1 Dec, 2017", "1 Jan, 2018")
csv.write(klines2)
# fetch weekly klines since it listed
klines3 = client.get_historical_klines("NEOBTC", Client.KLINE_INTERVAL_1WEEK,
"1 Jan, 2017")
csv.write(klines3)
def load_data(klines, sequence_length):
raw_data = pd.read_csv(klines, dtype=float).values
for x in range(0, raw_data.shape[0]):
for y in range(0, raw_data.shape[1]):
if(raw_data[x][y] == 0):
raw_data[x][y] = raw_data[x-1][y]
data = raw_data.tolist()
result = []
for index in range(len(data) - sequence_length):
result.append(data[index: index + sequence_length])
d0 = np.array(result)
dr = np.zeros_like(d0)
dr[:, 1:, :] = d0[:, 1:, :] / d0[:, 0:1, :] - 1
start = 2400
end = int(dr.shape[0] + 1)
unnormalized_bases = d0[start:end, 0:1, 20]
split_line = round(0.9 * dr.shape[0])
training_data = dr[:int(split_line), :]
np.random.shuffle(training_data)
X_train = training_data[:, :-1]
Y_train = training_data[:, -1]
Y_train = Y_train[:, 20]
X_test = dr[int(split_line):, :-1]
Y_test = dr[int(split_line):, 49, :]
Y_test = Y_test[:, 20]
Y_daybefore = dr[int(split_line):, 48, :]
Y_daybefore = Y_daybefore[:, 20]
sequence_length = sequence_length
window_size = sequence_length - 1
return X_train, Y_train, X_test, Y_test, Y_daybefore, unnormalized_bases, window_size
def initialize_model(window_size, dropout_value, activation_function, loss_function, optimizer):
model = Sequential()
model.add(Bidirectional(LSTM(window_size, return_sequences=True), input_shape=(window_size, X_train.shape[-1]),))
model.add(Dropout(dropout_value))
model.add(Bidirectional(LSTM((window_size*2), return_sequences=True)))
model.add(Dropout(dropout_value))
model.add(Bidirectional(LSTM(window_size, return_sequences=False)))
model.add(Dense(units=1))
model.add(Activation(activation_function))
model.compile(loss=loss_function, optimizer=optimizer)
return model
def fit_model(model, X_train, Y_train, batch_num, num_epoch, val_split):
start = time.time()
model.fit(X_train, Y_train, batch_size= batch_num, nb_epoch=num_epoch, validation_split= val_split)
training_time = int(math.floor(time.time() - start))
return model, training_time
def test_model(model, X_test, Y_test, unnormalized_bases):
y_predict = model.predict(X_test)
real_y_test = np.zeros_like(Y_test)
real_y_predict = np.zeros_like(y_predict)
for i in range(Y_test.shape[0]):
y = Y_test[i]
predict = y_predict[i]
real_y_test[i] = (y+1)*unnormalized_bases[i]
real_y_predict[i] = (predict+1)*unnormalized_bases[i]
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111)
ax.set_title("Bitcoin Price Over Time")
plt.plot(real_y_predict, color = 'green', label = 'Predicted Price')
plt.plot(real_y_test, color = 'red', label = 'Real Price')
ax.set_ylabel("Price (USD)")
ax.set_xlabel("Time (Days)")
ax.legend()
return y_predict, real_y_test, real_y_predict, fig
def price_change(Y_daybefore, Y_test, y_predict):
Y_daybefore = np.reshape(Y_daybefore, (-1, 1))
Y_test = np.reshape(Y_test, (-1, 1))
delta_predict = (y_predict - Y_daybefore) / (1+Y_daybefore)
delta_real = (Y_test - Y_daybefore) / (1+Y_daybefore)
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
ax.set_title("Percent Change in Bitcoin Price Per Day")
plt.plot(delta_predict, color='green', label = 'Predicted Percent Change')
plt.plot(delta_real, color='red', label = 'Real Percent Change')
plt.ylabel("Percent Change")
plt.xlabel("Time (Days)")
ax.legend()
plt.show()
return Y_daybefore, Y_test, delta_predict, delta_real, fig
def binary_price(delta_predict, delta_real):
delta_predict_1_0 = np.empty(delta_predict.shape)
delta_real_1_0 = np.empty(delta_real.shape)
for i in range(delta_predict.shape[0]):
if delta_predict[i][0] > 0:
delta_predict_1_0[i][0] = 1
else:
delta_predict_1_0[i][0] = 0
for i in range(delta_real.shape[0]):
if delta_real[i][0] > 0:
delta_real_1_0[i][0] = 1
else:
delta_real_1_0[i][0] = 0
return delta_predict_1_0, delta_real_1_0
def find_positives_negatives(delta_predict_1_0, delta_real_1_0):
true_pos = 0
false_pos = 0
true_neg = 0
false_neg = 0
for i in range(delta_real_1_0.shape[0]):
real = delta_real_1_0[i][0]
predicted = delta_predict_1_0[i][0]
if real == 1:
if predicted == 1:
true_pos += 1
else:
false_neg += 1
elif real == 0:
if predicted == 0:
true_neg += 1
else:
false_pos += 1
return true_pos, false_pos, true_neg, false_neg
def calculate_statistics(true_pos, false_pos, true_neg, false_neg, y_predict, Y_test):
precision = float(true_pos) / (true_pos + false_pos)
recall = float(true_pos) / (true_pos + false_neg)
F1 = float(2 * precision * recall) / (precision + recall)
MSE = mean_squared_error(y_predict.flatten(), Y_test.flatten())
return precision, recall, F1, MSE
X_train, Y_train, X_test, Y_test, Y_daybefore, unnormalized_bases, window_size = load_data("Bitcoin Data.csv", 50)
print (X_train.shape)
print (Y_train.shape)
print (X_test.shape)
print (Y_test.shape)
print (Y_daybefore.shape)
print (unnormalized_bases.shape)
print (window_size)
model = initialize_model(window_size, 0.2, 'linear', 'mse', 'adam')
print model.summary()
model, training_time = fit_model(model, X_train, Y_train, 1024, 100, .05)
print "Training time", training_time, "seconds"
y_predict, real_y_test, real_y_predict, fig1 = test_model(model, X_test, Y_test, unnormalized_bases)
plt.show(fig1)
Y_daybefore, Y_test, delta_predict, delta_real, fig2 = price_change(Y_daybefore, Y_test, y_predict)
plt.show(fig)
delta_predict_1_0, delta_real_1_0 = binary_price(delta_predict, delta_real)
print delta_predict_1_0.shape
print delta_real_1_0.shape
true_pos, false_pos, true_neg, false_neg = find_positives_negatives(delta_predict_1_0, delta_real_1_0)
print "True positives:", true_pos
print "False positives:", false_pos
print "True negatives:", true_neg
print "False negatives:", false_neg
precision, recall, F1, MSE = calculate_statistics(true_pos, false_pos, true_neg, false_neg, y_predict, Y_test)
print "Precision:", precision
print "Recall:", recall
print "F1 score:", F1
print "Mean Squared Error:", MSE
class Client(object):
API_URL = 'https://api.binance.com/api'
WITHDRAW_API_URL = 'https://api.binance.com/wapi'
WEBSITE_URL = 'https://www.binance.com'
PUBLIC_API_VERSION = 'v1'
PRIVATE_API_VERSION = 'v3'
WITHDRAW_API_VERSION = 'v3'
SYMBOL_TYPE_SPOT = 'SPOT'
ORDER_STATUS_NEW = 'NEW'
ORDER_STATUS_PARTIALLY_FILLED = 'PARTIALLY_FILLED'
ORDER_STATUS_FILLED = 'FILLED'
ORDER_STATUS_CANCELED = 'CANCELED'
ORDER_STATUS_PENDING_CANCEL = 'PENDING_CANCEL'
ORDER_STATUS_REJECTED = 'REJECTED'
ORDER_STATUS_EXPIRED = 'EXPIRED'
KLINE_INTERVAL_1MINUTE = '1m'
KLINE_INTERVAL_3MINUTE = '3m'
KLINE_INTERVAL_5MINUTE = '5m'
KLINE_INTERVAL_15MINUTE = '15m'
KLINE_INTERVAL_30MINUTE = '30m'
KLINE_INTERVAL_1HOUR = '1h'
KLINE_INTERVAL_2HOUR = '2h'
KLINE_INTERVAL_4HOUR = '4h'
KLINE_INTERVAL_6HOUR = '6h'
KLINE_INTERVAL_8HOUR = '8h'
KLINE_INTERVAL_12HOUR = '12h'
KLINE_INTERVAL_1DAY = '1d'
KLINE_INTERVAL_3DAY = '3d'
KLINE_INTERVAL_1WEEK = '1w'
KLINE_INTERVAL_1MONTH = '1M'
SIDE_BUY = 'BUY'
SIDE_SELL = 'SELL'
ORDER_TYPE_LIMIT = 'LIMIT'
ORDER_TYPE_MARKET = 'MARKET'
ORDER_TYPE_STOP_LOSS = 'STOP_LOSS'
ORDER_TYPE_STOP_LOSS_LIMIT = 'STOP_LOSS_LIMIT'
ORDER_TYPE_TAKE_PROFIT = 'TAKE_PROFIT'
ORDER_TYPE_TAKE_PROFIT_LIMIT = 'TAKE_PROFIT_LIMIT'
ORDER_TYPE_LIMIT_MAKER = 'LIMIT_MAKER'
TIME_IN_FORCE_GTC = 'GTC'
TIME_IN_FORCE_IOC = 'IOC'
TIME_IN_FORCE_FOK = 'FOK'
ORDER_RESP_TYPE_ACK = 'ACK'
ORDER_RESP_TYPE_RESULT = 'RESULT'
ORDER_RESP_TYPE_FULL = 'FULL'
AGG_ID = 'a'
AGG_PRICE = 'p'
AGG_QUANTITY = 'q'
AGG_FIRST_TRADE_ID = 'f'
AGG_LAST_TRADE_ID = 'l'
AGG_TIME = 'T'
AGG_BUYER_MAKES = 'm'
AGG_BEST_MATCH = 'M'
def run():
initialize_arb()
pass
def initialize_arb():
welcome_message = "\n\n---------------------------------------------------------\n\n"
welcome_message+= "Hello and Welcome to the Binance Arbitrage Crypto Trader Bot Python Script\nCreated 2018 by Joaquin Roibal (#BlockchainEng)"
welcome_message+= "A quick 'run-through' will be performed to introduce you to the functionality of this bot\n"
welcome_message+="To learn more visit medium.com/#BlockchainEng or watch introductory Youtube Videos"
welcome_message+="\nCopyright 2018 by Joaquin Roibal\n"
bot_start_time = str(datetime.now())
welcome_message+= "\nBot Start Time: {}\n\n\n".format(bot_start_time)
print(welcome_message)
data_log_to_file(welcome_message)
time.sleep(5)
try:
status = client.get_system_status()
list_of_symbols = ['ETHBTC', 'BNBETH', 'BNBBTC']
list_of_symbols2 = ['ETHUSDT', 'BNBETH', 'BNBUSDT']
list_of_symbols3 = ['BTCUSDT', 'BNBBTC', 'BNBUSDT']
list_of_arb_sym = [list_of_symbols, list_of_symbols2, list_of_symbols3]
tickers = client.get_orderbook_tickers()
portfolio=[]
with open('Portfolio.txt') as f1:
read_data = f1.readlines()
for line in read_data:
load_portfolio = line
load_portfolio = list(load_portfolio[1:-1].split(','))
i=0
for val in load_portfolio:
if i == 4:
portfolio.append(str(datetime.now()))
break
portfolio.append(float(val))
i+=1
portf_msg = "Starting Portfolio: " + str(portfolio)
print(portf_msg)
portf_file_save(portfolio)
data_log_to_file(portf_msg)
while 1:
calc_profit_list =[]
for arb_market in list_of_arb_sym:
calc_profit_list.append(arbitrage_bin(arb_market, tickers, portfolio, 1, 1))
for profit1 in calc_profit_list:
data_log_to_file(str(profit1))
print(calc_profit_list)
exp_profit = 0
m = n = 0
for exch_market in calc_profit_list:
if exch_market[4]>exp_profit:
exp_profit = exch_market[4]
m = n
n+=1
profit_message = "\nMost Profitable Market: {} \nExpected Profit: {}%".format(list_of_arb_sym[m], exp_profit)
print(profit_message)
data_log_to_file(profit_message)
time.sleep(5)
arb_list_data = []
arb_start_time = str(datetime.now())
for i in range(0,5):
arb_list_data.append(arbitrage_bin(list_of_arb_sym[m], tickers, portfolio, 1, 1, 'Yes'))
time.sleep(30)
arb_end_time = str(datetime.now())
viz_arb_data(arb_list_data, list_of_arb_sym[m], arb_start_time, arb_end_time)
except:
print("\nFAILURE INITIALIZE\n")
def data_log_to_file(message):
with open('CryptoTriArbBot_DataLog.txt', 'a+') as f:
f.write(message)
def portf_file_save(portfolio):
with open('Portfolio.txt', 'a+') as f:
f.write('\n'+str(portfolio))
def arbitrage_bin(list_of_sym, tickers, portfolio, cycle_num=10, cycle_time=30, place_order='No'):
arb_message = "Beginning Binance Arbitrage Function Data Collection - Running\n"
print(arb_message)
data_log_to_file(arb_message)
time.sleep(2)
fee_percentage = 0.05
for i in range(0,1):
"""
pairs = []
for sym in symbols:
for symbol in coins:
if symbol in sym:
pairs.append(sym)
print(pairs)
#From Coin 1 to Coin 2 - ETH/BTC - Bid
#From Coin 2 to Coin 3 - ETH/LTC - Ask
#From Coin 3 to Coin 1 - BTC/LTC - Bid
arb_list = ['ETH/BTC'] #, 'ETH/LTC', 'BTC/LTC']
#Find 'closed loop' of currency rate pairs
j=0
while 1:
if j == 1:
final = arb_list[0][-3:] + '/' + str(arb_list[1][-3:])
print(final)
#if final in symbols:
arb_list.append(final)
break
for sym in symbols:
if sym in arb_list:
pass
else:
if j % 2 == 0:
if arb_list[j][0:3] == sym[0:3]:
if arb_list[j] == sym:
pass
else:
arb_list.append(sym)
print(arb_list)
j+=1
break
if j % 2 == 1:
if arb_list[j][-3:] == sym[-3:]:
if arb_list[j] == sym:
pass
else:
arb_list.append(sym)
print(arb_list)
j+=1
break
"""
print("List of Arbitrage Symbols:", list_of_sym)
list_exch_rate_list = []
if 1:
for k in range(0,cycle_num):
i=0
exch_rate_list = []
data_collect_message1 = "Data Collection Cycle Number: "+str(k) +'\n'
print(data_collect_message1)
data_log_to_file(data_collect_message1)
for sym in list_of_sym:
currency_pair = "Currency Pair: "+str(sym)+"\n"
print(currency_pair)
data_log_to_file(currency_pair)
if sym in list_of_sym:
"""if i == 0: #For first in triangle
depth = client.get_order_book(symbol=sym)
exch_rate_list.append(float(depth['bids'][0][0]))
print(depth['bids'][0][0])
"""
if i % 2==0:
depth = client.get_order_book(symbol=sym)
inv1 = depth['asks'][0][0]
exch_rate_list.append(float(inv1))
Exch_rate1 = "Exchange Rate: {}".format(depth['asks'][0][0]) +'\n'
print(Exch_rate1)
data_log_to_file(Exch_rate1)
if i == 1:
depth = client.get_order_book(symbol=sym)
inv2 = round(1.0/float(depth['bids'][0][0]),6)
exch_rate_list.append(float(inv2))
Exch_rate2 = "Exchange Rate: {}".format(depth['bids'][0][0])+'\n'
print(Exch_rate2)
data_log_to_file(Exch_rate2)
i+=1
else:
exch_rate_list.append(0)
exch_rate_list.append(datetime.now())
rate1 = exch_rate_list[0]
buy_price = "Buy: {}\n".format(rate1)
print(buy_price)
data_log_to_file(buy_price)
rate2 = float(exch_rate_list[2])*float(exch_rate_list[1])
sell_price = "Sell: {}\n".format(rate2)
print(sell_price)
data_log_to_file(sell_price)
if float(rate1)<float(rate2):
arb_1_msg = "Arbitrage Possibility - "
arb_profit = round((float(rate2)-float(rate1))/float(rate2)*100,3)
arb_1_msg += "Potential Profit (Percentage): "+str(arb_profit) +'%\n'
print(arb_1_msg)
data_log_to_file(arb_1_msg)
exch_rate_list.append(arb_profit)
if place_order == 'Yes':
place_order_msg = "PLACING ORDER"
print(place_order_msg)
data_log_to_file(place_order_msg)
portfolio = tri_arb_paper(portfolio, list_of_sym, exch_rate_list)
portf_file_save(portfolio)
else:
arb_2_msg = "No Arbitrage Possibility"
print(arb_2_msg)
data_log_to_file(arb_2_msg)
exch_rate_list.append(0)
exch_msg = "Exchange Rate List: " +str(exch_rate_list)+'\n'
print(exch_msg)
data_log_to_file(exch_msg)
time.sleep(cycle_time)
print('\nARBITRAGE FUNCTIONALITY SUCCESSFUL - Data of Exchange Rates Collected\n')
return exch_rate_list
def tri_arb_paper(portfolio1, sym_list, list_exch_rates):
tri_arb_paper_msg = "\nSTARTING TRI ARB PAPER TRADING FUNCTION\n"
print(tri_arb_paper_msg)
time.sleep(10)
data_log_to_file(tri_arb_paper_msg)
if sym_list[0][-3:]=='BTC':
portf_pos = 0
elif sym_list[0][-3:]=='ETH':
portf_pos = 1
elif sym_list[0][-3:]=='SDT':
portf_pos = 2
elif sym_list[0][-3:]=='BNB':
portf_pos = 3
start_amount = float(portfolio1[portf_pos])
amt_coin2 = start_amount / float(list_exch_rates[0])
amt_coin3 = amt_coin2 * float(list_exch_rates[1])
final_amount = amt_coin3 * float(list_exch_rates[2])
tri_arb_paper_msg = "Starting Amount: "+str(sym_list[0][-3:])+" "+str(start_amount)+'\n'
tri_arb_paper_msg += "Amount Coin 2: "+str(sym_list[0][0:3])+" "+str(amt_coin2)+'\n'
tri_arb_paper_msg += "Amount Coin 3: "+str(sym_list[2][0:3])+" "+str(amt_coin3) +'\n'
tri_arb_paper_msg += "Final Amount: "+str(sym_list[0][-3:])+" "+str(final_amount)+'\n'
print(tri_arb_paper_msg)
data_log_to_file(tri_arb_paper_msg)
portfolio1[portf_pos] = final_amount
portfolio1[-1] = str(datetime.now())
return portfolio1
def viz_arb_data(list_exch_rate_list, arb_market, start_time, end_time):
viz_msg = "RUNNING ARBITRAGE VISUALIZATION FUNCTIONALITY"
print(viz_msg)
data_log_to_file(viz_msg)
rateA = []
rateB = []
rateB_fee = []
price1 = []
price2 = []
time_list = []
profit_list = []
for rate in list_exch_rate_list:
rateA.append(rate[0])
rateB1 = round(float(rate[1])*float(rate[2]),6)
rateB.append(rateB1)
price1.append(rate[1])
price2.append(rate[2])
profit_list.append(rate[4])
time_list.append(rate[3])
viz_msg2 = "Rate A: {} \n Rate B: {} \n Projected Profit (%): {} ".format(rateA, rateB, profit_list) #rateB_fee))
print(viz_msg2)
data_log_to_file(viz_msg2)
fig, host = plt.subplots()
fig.subplots_adjust(right=0.75)
par1 = host.twinx()
par2 = host.twinx()
par2.spines["right"].set_position(("axes", 1.2))
make_patch_spines_invisible(par2)
par2.spines["right"].set_visible(True)
p1, = host.plot(time_list, rateA, "k", label = "{}".format(arb_market[0]))
p1, = host.plot(time_list, rateB, "k+", label = "{} * {}".format(arb_market[1], arb_market[2]))
p2, = par1.plot(time_list, price1, "b-", label="Price - {}".format(arb_market[1]))
p3, = par2.plot(time_list, price2, "g-", label="Price - {}".format(arb_market[2]))
host.set_xlabel("Time")
host.set(title='Triangular Arbitrage - Exchange: {}\nStart Time: {}\n End Time: {}\n'
'Copyright (c) 2018 #BlockchainEng'.format('Binance', start_time, end_time))
host.set_ylabel("Exchange Rate")
par1.set_ylabel("Price - {}".format(arb_market[1]))
par2.set_ylabel("Price - {}".format(arb_market[2]))
host.yaxis.label.set_color(p1.get_color())
tkw = dict(size=4, width=1.5)
host.tick_params(axis='y', colors=p1.get_color(), **tkw)
par1.tick_params(axis='y', colors=p2.get_color(), **tkw)
par2.tick_params(axis='y', colors=p3.get_color(), **tkw)
host.tick_params(axis='x', **tkw)
lines = [p1, p2, p3]
host.legend(lines, [l.get_label() for l in lines])
fname = "Binance_Test.png"
plt.savefig(fname)
""", dpi=None, facecolor='w', edgecolor='w',
orientation='portrait', papertype=None, format=None,
transparent=False, bbox_inches=None, pad_inches=0.1,
frameon=None)"""
print_figure_message = "Data Collected Figure Printed & Saved - " + str(fname)
print(print_figure_message)
data_log_to_file(print_figure_message)
def make_patch_spines_invisible(ax):
ax.set_frame_on(True)
ax.patch.set_visible(False)
for sp in ax.spines.values():
sp.set_visible(False)
"""
def market_depth(sym, num_entries=20):
#Get market depth
#Retrieve and format market depth (order book) including time-stamp
i=0 #Used as a counter for number of entries
#print("Order Book: ", convert_time_binance(client.get_server_time()))
depth = client.get_order_book(symbol=sym)
print(depth)
print(depth['asks'][0])
ask_tot=0.0
ask_price =[]
ask_quantity = []
bid_price = []
bid_quantity = []
bid_tot = 0.0
place_order_ask_price = 0
place_order_bid_price = 0
max_order_ask = 0
max_order_bid = 0
print("\n", sym, "\nDepth ASKS:\n")
print("Price Amount")
for ask in depth['asks']:
if i<num_entries:
if float(ask[1])>float(max_order_ask):
#Determine Price to place ask order based on highest volume
max_order_ask=ask[1]
place_order_ask_price=round(float(ask[0]),5)-0.0001
#ask_list.append([ask[0], ask[1]])
ask_price.append(float(ask[0]))
ask_tot+=float(ask[1])
ask_quantity.append(ask_tot)
#print(ask)
i+=1
j=0 #Secondary Counter for Bids
print("\n", sym, "\nDepth BIDS:\n")
print("Price Amount")
for bid in depth['bids']:
if j<num_entries:
if float(bid[1])>float(max_order_bid):
#Determine Price to place ask order based on highest volume
max_order_bid=bid[1]
place_order_bid_price=round(float(bid[0]),5)+0.0001
bid_price.append(float(bid[0]))
bid_tot += float(bid[1])
bid_quantity.append(bid_tot)
#print(bid)
j+=1
return ask_price, ask_quantity, bid_price, bid_quantity, place_order_ask_price, place_order_bid_price
#Plot Data
"""
if __name__ == "__main__":
run()
Ideally, the code is supposed to find arbitrage opportunities in predicted price changes and execute orders accordingly.
import os
import tarfile
from six.moves import urllib
URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
PATH = 'aclImdb'
def fetch_data(url = URL, path = PATH):
if not os.path.isdir(path):
os.makedirs(path)
file_path = os.path.join(oath, "aclImdb_v1.tar.gz")
urllib.request.urlretrieve(url, file_path)
file_gz = tarfile.open(file_path)
file_gz.extractall(path = path)
file_gz.close()
import pyprind # for progress visualisation
import pandas as pd
PATH = 'aclImdb'
labels = {'pos': 1, 'neg': 0} # int class labels for 'positive' and 'negative'
pbar = pyprind.ProgBar(50000) # initialise a progress bar with 50k iterations = no. of docs
df = pd.DataFrame()
# use nested for loops to iterate over 'train' & 'test' subdir
for s in ('test', 'train'):
for l in ('pos', 'neg'): # and read text files from 'pos' and 'neg' subdir
path = os.path.join(PATH, s, l)
for file in os.listdir(path):
# append to the df pandas DataFrame with an int class (post = 1, neg = 0)
with open(os.path.join(path, file), 'r', encoding = 'utf-8') as infile:
txt = infile.read()
df = df.append([[txt, labels[l]]], ignore_index = True)
pbar.update()
df.columns = ['review', 'sentiment']
import numpy as np
np. random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')
n_words = max(list(word_to_int.values())) + 1
df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head(3)
# Separate words and count each word's occurence
import pyprind # for progress visualisation
from collections import Counter
from string import punctuation
import re
counts = Counter() # collects the counts of occurence of each unique word
pbar = pyprind.ProgBar(len(df['review']),
title = 'Counting word occurences...') # progress bar
for i, review in enumerate(df['review']):
text = ''.join([c if c not in punctuation else ' '+c+' '
for c in review]).lower()
df.loc[i, 'review'] = text
pbar.update()
counts.update(text.split())
# Mapping each unique word to an int
word_counts = sorted(counts, key = counts.get, reverse = True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}
mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
title = 'Map movie reviews to integers...')
# Left-pad with zeros if the sequence length < 200
# Use 200 elements if the length > 200
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype = int)
for i, row in enumerate(mapped_reviews):
review_arr = np.array(row)
sequences[i, -len(row):] = review_arr[-sequence_length:]
# Split the dataset into training and test sets
X_train = sequences[:25000, :]
y_train = df.loc[:25000, 'sentiment'].values
X_test = sequences[25000:, :]
y_test = df.loc[25000:, 'sentiment'].values
# Define the mini-batches generator
np.random.seed(123)
def batch_gen(x, y = None, batch_size = 64):
n_batches = len(x) // batch_size
x = x[:n_batches * batch_size]
if y is not None:
y = y[:n_batches * batch_size]
for ii in range(0, len(x), batch_size):
if y is not None:
yield x[ii : ii + batch_size], y[ii : ii + batch_size]
else:
yield x[ii : ii + batch_size]
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' ## suppress the 3.5 warning if using TF 1.4
class SentimentRNN(object):
# Define __init__
def __init__(self,
n_words,
seq_len = 200,
lstm_size = 256,
num_layers = 1,
batch_size = 64,
learning_rate = 0.0001,
embed_size = 200):
self.n_words = n_words
self.seq_len = seq_len
self.lstm_size = lstm_size # no. of hidden units
self.num_layers = num_layers
self.batch_size = batch_size
self.learning_rate = learning_rate
self.embed_size = embed_size
self.g = tf.Graph()
with self.g.as_default():
tf.set_random_seed(123)
self.build()
self.saver = tf.train.Saver()
self.init_op = tf.global_variables_initializer()
# Define the build method
def build(self):
# Define the placeholders
tf_x = tf.placeholder(tf.int32,
shape = (self.batch_size, self.seq_len),
name = 'tf_x')
tf_y = tf.placeholder(tf.float32,
shape = (self.batch_size),
name = 'tf_y')
tf_keepprob = tf.placeholder(tf.float32,
name = 'tf_keepprob')
# Create the embedding layer
embedding = tf.Variable(
tf.random_uniform(
shape = (self.n_words, self.embed_size),
minval = -1,
maxval = 1),
name = 'embedding')
embed_x = tf.nn.embedding_lookup(embedding,
tf_x,
name = 'embed_x')
# Define LSTM cells and stack them
cells = tf.contrib.rnn.MultiRNNCell(
[tf.contrib.rnn.DropoutWrapper(
tf.contrib.rnn.BasicLSTMCell(num_units = self.lstm_size),
output_keep_prob = tf_keepprob)
for i in range(self.num_layers)])
# Define the initial state:
self.initial_state = cells.zero_state(
self.batch_size, tf.float32)
print(' << initial state >> ', self.initial_state)
# Put together components with tf.nn.dynamic_rnn
lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
cell = cells,
inputs = embed_x,
initial_state = self.initial_state)
## lstm_outputs shape: [batch_size, max_time, cells.output_size]
print('\n << lstm_output >> ', lstm_outputs)
print('\n << final state >> ', self.final_state)
# Apply a full-connected layer on the RNN output
logits = tf.layers.dense(
inputs = lstm_outputs[:, -1],
units = 1, # dimensionality of the output space
activation = None,
name = 'logits')
# Remove dimensions of size 1 from the tensor shape
logits = tf.squeeze(input = logits,
name = 'logits_squeezed')
print ('\n << logits >> ', logits)
# If you want prob's
y_proba = tf.nn.sigmoid(logits, name = 'probabilities')
predictions = {'probabilities' : y_proba,
'labels' : tf.cast(tf.round(y_proba),
tf.int32,
name = 'labels')}
print('\n << predictions >> ', predictions)
# Define the cost function
cost = tf.reduce_mean(
tf.nn.sigmoid_cross_entropy_with_logits(
labels = tf_y,
logits = logits),
name = 'cost')
# Define the optimiser
optimizer = tf.train.AdamOptimizer(self.learning_rate)
train_op = optimizer.minimize(cost, name = 'train_op')
# Define the train method
def train(self, X_train, y_train, num_epochs):
with tf.Session(graph = self.g) as sess:
sess.run(self.init_op)
iteration = 1
for epoch in range(num_epochs):
state = sess.run(self.initial_state)
for batch_x, batch_y in batch_gen(
X_train,
y_train,
batch_size = self.batch_size):
feed = {'tf_x:0' : batch_x,
'tf_y:0' : batch_y,
'tf_keepprob:0' : 0.5,
self.initial_state : state}
loss, _, state = sess.run(
['cost:0',
'train_op',
self.final_state],
feed_dict=feed)
if iteration % 20 == 0:
print("Epoch: %d/%d Iteration: %d "
"| Train loss: %.5f" % (
epoch + 1,
num_epochs,
iteration,
loss))
iteration += 1
if (epoch + 1) % 10 == 0:
self.saver.save(
sess,
"model/sentiment-%d.ckpt" % epoch)
# Define the predict method
def predict(self, X_data, return_proba=False):
preds = []
with tf.Session(graph = self.g) as sess:
self.saver.restore(
sess,
tf.train.latest_checkpoint('model/'))
test_state = sess.run(self.initial_state)
for ii, batch_x in enumerate(batch_gen(
x = X_data,
y = None,
batch_size = self.batch_size), 1):
feed = {'tf_x:0' : batch_x,
'tf_keepprob:0' : 1.0,
self.initial_state : test_state}
if return_proba:
pred, test_state = sess.run(
['probabilities:0', self.final_state],
feed_dict=feed)
else:
pred, test_state = sess.run(
['labels:0', self.final_state],
feed_dict=feed)
preds.append(pred)
return np.concatenate(preds)
for review in df['review']:
mapped_reviews.append([word_to_int[word] for word in review.split()])
pbar.update()
rnn = SentimentRNN(n_words = n_words,
seq_len = sequence_length,
embed_size = 256,
lstm_size = 128,
num_layers = 1,
batch_size = 100,
learning_rate = 0.001)
preds = rnn.predict(X_test)
y_true = y_test\[:len(preds)\]
print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))][1]
Create an object of the SentimentRNN class with the following parameters:
n_words = n_words, seq_len = sequence_length, embed_size = 256, lstm_size = 128, num_layers = 1, batch_size = 100, learning_rate = 0.001.
Since we have a relatively small dataset, the number of layers = 1 may generalise better
enter image description here
ValueError Traceback (most recent call last)
<ipython-input-23-a3cfe03a9a49> in <module>()
----> 1 preds = rnn.predict(X_test)
2 y_true = y_test[:len(preds)]
3 print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))
<ipython-input-12-d83ee67c43b6> in predict(self, X_data, return_proba)
173 self.saver.restore(
174 sess,
--> 175 tf.train.latest_checkpoint('model/'))
176 test_state = sess.run(self.initial_state)
177
/usr/local/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py in restore(self, sess, save_path)
1680 return
1681 if save_path is None:
-> 1682 raise ValueError("Can't load save_path when it is None.")
1683 logging.info("Restoring parameters from %s", save_path)
1684 if context.in_graph_mode():
ValueError: Can't load save_path when it is None.
The error just means tf.train.latest_checkpoint didn't find anything. It returns None, then the Saver complains because it was passed None. So there's no checkpoint in that directory.