I have a TensorFlow implementation of GloVe:
Xij = tf.placeholder(tf.float32, shape=[None], name="Xij")
wordI = tf.placeholder(tf.int32, shape=[None], name="wordI")
wordIW = tf.Variable(tf.random_uniform([inputSize, embedSize], -initRange, initRange), name="wordIW")
wordIB = tf.Variable(tf.random_uniform([inputSize], -initRange, initRange), name="wordIB")
wi = tf.nn.embedding_lookup([wordIW], wordI, name="wi")
bi = tf.nn.embedding_lookup([wordIB], wordI, name="bi")
wordJ = tf.placeholder(tf.int32, shape=[None], name="wordJ")
wordJW = tf.Variable(tf.random_uniform([outputSize, embedSize], -initRange, initRange), name="wordJW")
wordJB = tf.Variable(tf.random_uniform([outputSize], -initRange, initRange), name="wordJB")
wj = tf.nn.embedding_lookup([wordJW], wordJ, name="wj")
bj = tf.nn.embedding_lookup([wordJB], wordJ, name="bj")
scalingFactor = tf.constant([scalingFactor], name="scalingFactor")
countMax = tf.constant([countMax], name="countMax")
wFactor = tf.minimum(1.0, tf.pow(tf.div(Xij, countMax), scalingFactor))
wiwjProduct = tf.reduce_sum(tf.multiply(wi, wj), 1)
logXij = tf.log(Xij)
dist = tf.square(tf.add_n([wiwjProduct, bi, bj, tf.negative(logXij)]))
loss = tf.reduce_sum(tf.multiply(wFactor, dist), name="loss")
tf.summary.scalar("GloVeLoss", loss)
global_step = tf.Variable(0, trainable=False, name="global_step")
learn_rate = tf.Variable(learn_rate, trainable=False, name="learn_rate")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate, name="optimizer").minimize(loss, global_step=global_step)
saver = tf.train.Saver()
I would like to add a new input (increase inputSize by 1 and add the corresponding rows to wordIW and wordIB). I can't find a proper way to modify the existing graph.
PS: my goal is to stop the gradient on the existing variables and optimize the loss only for the new input.
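One way to do this without editing the frozen graph in place is to rebuild the graph so that the lookup reads from a concatenation of the (frozen) pretrained weights and a single new trainable row. Below is a minimal sketch under that assumption, reusing the names above; newRowW and newRowB are new names introduced here, and the old values can still be restored from your checkpoint before training:
newRowW = tf.Variable(tf.random_uniform([1, embedSize], -initRange, initRange), name="newRowW")
newRowB = tf.Variable(tf.random_uniform([1], -initRange, initRange), name="newRowB")
frozenW = tf.stop_gradient(wordIW)                 # no gradients flow into the pretrained rows
frozenB = tf.stop_gradient(wordIB)
extendedW = tf.concat([frozenW, newRowW], axis=0)  # shape [inputSize + 1, embedSize]
extendedB = tf.concat([frozenB, newRowB], axis=0)  # shape [inputSize + 1]
wi = tf.nn.embedding_lookup(extendedW, wordI, name="wi_ext")
bi = tf.nn.embedding_lookup(extendedB, wordI, name="bi_ext")
# Alternatively (or additionally), restrict the optimizer to the new variables only:
optimizer = tf.train.GradientDescentOptimizer(learn_rate).minimize(
    loss, var_list=[newRowW, newRowB], global_step=global_step)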
Tuning is done as follows:
tuner = kt.RandomSearch(
    MyHyperModel(),
    objective="mae",
    max_trials=30,
    overwrite=True,
    directory=results_dir,
    project_name="tune_hypermodel",
)
And I'm iterating over how many features to use:
data[name]=pd.read_pickle(os.path.join(root, name)+'/'+name+'.data.reindexed_by_pc.pkl')
CpG_num_lst=[100,500,1000,5000,10000,20000,40000]
train_score, valid_score = [], []
hps_by_CpG_num = []
for CpG_num in CpG_num_lst:
    print("CpG_num:", CpG_num)
    # Force-remove the previous tuner search directory (to start a new search),
    # because even with overwrite=True it doesn't overwrite.
    if os.path.exists(results_dir+'/tune_hypermodel'):
        shutil.rmtree(results_dir+'/tune_hypermodel')
    # initialize
    X1, y1, X2, y2 = [dict() for _ in range(4)]
    X1[name] = data[name][fold1_ids].head(CpG_num).values.T
    X2[name] = data[name][fold2_ids].head(CpG_num).values.T
    # Get the ages of the corresponding persons. Note that info_1 and info_2 only contain "Ctrl" and not "Test" samples.
    y1[name] = info_1[name].query("`Train.Test`=='Train'")['Age'].values.astype(float)
    y2[name] = info_2[name].query("`Train.Test`=='Train'")['Age'].values.astype(float)
    # Split the data
    X1_train, X1_valid, y1_train, y1_valid = train_test_split(X1[name], y1[name], test_size=0.2, shuffle=True)
    # Hyperparameter search
    tuner.search(X1_train, y1_train, validation_data=(X1_valid, y1_valid))
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    # Get best hyperparameters
    hp_dict = dict()
    hp_dict['num_layers'] = best_hps.get('num_layers')
    hp_dict['batch_size'] = best_hps.get('batch_size')
    hp_dict['act_1'] = best_hps.get('act_1')
    hp_dict['act_2'] = best_hps.get('act_2')
    hp_dict['units_1'] = best_hps.get('units_1')
    hp_dict['units_2'] = best_hps.get('units_2')
    hps_by_CpG_num.append(hp_dict)
    # Build the best model
    best_model = MyHyperModel().build(hp=best_hps)
    history = best_model.fit(X1_train, y1_train, validation_data=(X1_valid, y1_valid), batch_size=best_hps.get('batch_size'), epochs=200)
However, for each element of the for loop the tuner search doesn't restart; it just reuses the best hyperparameters from the first search (CpG_num = 500).
What am I missing? Why is Keras Tuner reusing the old hyperparameters?
The solution was to include the tuner instantiation within the for loop.
tuner = kt.RandomSearch(
    MyHyperModel(),
    objective="mae",
    max_trials=30,
    overwrite=True,
    directory=results_dir,
    project_name="tune_hypermodel",
)
I'm not sure why, but it works... If somebody has any insight on this, let me know. I thought the tuner would be overwritten.
Is there a better way of doing this?
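A likely explanation (based on how Keras Tuner generally behaves, not anything specific to this code): overwrite=True only clears the project directory when the tuner is constructed, and the RandomSearch oracle stops once max_trials trials have been recorded. If the same tuner object is reused across loop iterations, later tuner.search() calls see that the trial budget is already spent and simply return the existing best trials. Re-instantiating the tuner inside the loop gives a fresh oracle each time; a minimal sketch of that pattern (the per-iteration project_name is an optional addition, not in the original code):
for CpG_num in CpG_num_lst:
    # ... build X1_train, X1_valid, y1_train, y1_valid for this CpG_num as above ...
    tuner = kt.RandomSearch(
        MyHyperModel(),
        objective="mae",
        max_trials=30,
        overwrite=True,                              # takes effect at construction time
        directory=results_dir,
        project_name=f"tune_hypermodel_{CpG_num}",   # optional: keeps each search separate
    )
    tuner.search(X1_train, y1_train, validation_data=(X1_valid, y1_valid))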
I used the SMOTE and Tomek methods for the imbalanced classes I have. I'm trying to fit a boosted regression tree.
It runs smoothly until I create the confusion matrix, where I get this error:
Error: data and reference should be factors with the same levels.
### SMOTE and Tomek
NOAA_SMOTE= read.csv("NOAA_SMOTE.csv", TRUE, ",")
train.index <- createDataPartition(NOAA_SMOTE$japon, p = .7, list = FALSE)
train <- NOAA_SMOTE[ train.index,]
test <- NOAA_SMOTE[-train.index,]
tomek = ubTomek(train[,-1], train[,1])
model_train_tomek = cbind(tomek$X,tomek$Y)
names(model_train_tomek)[1] = "japon"
removed.index = tomek$id.rm
train$japon = as.factor(train$japon)
train_tomek = train[-removed.index,]
## SMOTE after tomek links
traintomeksmote <- SMOTE(japon ~ ., train_tomek, perc.over = 2000,perc.under = 100)
fitControlSmoteTomek <- trainControl(## 10-fold CV
    method = "repeatedcv",
    number = 10,
    repeats = 3,
    ## Estimate class probabilities
    classProbs = TRUE,
    ## Evaluate performance using
    ## the following function
    summaryFunction = twoClassSummary)
gbmGridSmoteTomek <- expand.grid(interaction.depth = c(3, 4, 5, 6),
    n.trees = (1:30)*50,
    shrinkage = c(0.1, 0.001, 0.75, 0.0001),
    n.minobsinnode = 10)
gbmFitNOAASMOTETomek <- caret::train(make.names(japon) ~ ., data = traintomeksmote,
    method = "gbm",
    trControl = fitControlSmoteTomek,
    distribution = "bernoulli",
    verbose = FALSE,
    tuneGrid = gbmGridSmoteTomek,
    bag.fraction = 0.5,
    ## Specify which metric to optimize
    metric = "ROC")
test$japon = as.factor(test$japon)
PredNOAASMOTETomek <- predict(gbmFitNOAASMOTETomek, newdata= test ,type='prob')
cmSMOTETomekNOAA = confusionMatrix(PredNOAASMOTETomek , as.factor(test$japon), mode="everything")
Part of the data: [screenshot of the data](https://i.stack.imgur.com/jPgI9.png)
Many of the PyTorch examples use the Dataset.map() method from the Hugging Face datasets library. For example:
https://huggingface.co/voidful/wav2vec2-large-xlsr-53-tw-gpt
ds = load_dataset("common_voice", 'zh-TW', split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
def map_to_array(batch):
    audio = batch["audio"]
    batch["speech"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["sampling_rate"] = audio["sampling_rate"]
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")
    return batch
ds = ds.map(map_to_array)
def map_to_pred(batch):
    features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["predicted"] = processor.batch_decode(pred_ids)
    batch["target"] = batch["sentence"]
    return batch
result = ds.map(map_to_pred, batched=True, batch_size=3, remove_columns=list(ds.features.keys()))
However, implementing a custom map-style PyTorch Dataset only requires __len__() and __getitem__().
What is the correct way to convert a custom Dataset into one with all the useful methods needed by the examples?
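One option, assuming your custom dataset's __getitem__ returns a dict of plain Python/NumPy values (not tensors), is to materialize it into a Hugging Face datasets.Dataset, which then provides map(), cast_column(), and the other methods used in the examples. A rough sketch; my_torch_ds is a hypothetical instance of your map-style dataset, and Dataset.from_generator requires a reasonably recent version of the datasets library:
from datasets import Dataset

def gen():
    for i in range(len(my_torch_ds)):
        yield my_torch_ds[i]          # e.g. {"audio": ..., "sentence": ...}

hf_ds = Dataset.from_generator(gen)   # Dataset.from_list(...) or Dataset.from_dict(...) also work for in-memory data
hf_ds = hf_ds.map(map_to_array)       # the map()-based examples now apply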
I am trying to follow a Kaggle kernel for a BERT implementation:
https://www.kaggle.com/hiromoon166/save-bert-fine-tuning-model
But I am not able to select the target variables. I need to select multiple target columns as my y variable because this is a multi-label classification problem.
This is the line of code I am stuck on:
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
def convert_lines(example, max_seq_length, tokenizer):
    max_seq_length -= 2
    all_tokens = []
    longer = 0
    for i in range(example.shape[0]):
        tokens_a = tokenizer.tokenize(example[i])
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)
nb_epochs=1
bsz = 32
dict_path = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=dict_path, do_lower_case=True)
print('build tokenizer done')
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
train_df = train_df.sample(frac=0.01,random_state = 42)
#train_df['comment_text'] = train_df['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
print('sample used',train_lines.shape)
token_input = convert_lines(train_lines,maxlen,tokenizer)
seg_input = np.zeros((token_input.shape[0],maxlen))
mask_input = np.ones((token_input.shape[0],maxlen))
print(token_input.shape)
print(seg_input.shape)
print(mask_input.shape)
print('begin training')
model3.fit([token_input, seg_input, mask_input],train_labels,batch_size=bsz,epochs=nb_epochs)
Please help me understand how to select the target variables here.
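If this is multi-label classification, y should be a 2-D array with one column per label rather than the single target column. A hedged sketch, assuming the Jigsaw train.csv loaded above; the column names below are that competition's auxiliary toxicity labels, so swap in whichever label columns you actually need:
label_cols = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
train_lines = train_df['comment_text'].values
train_labels = train_df[label_cols].values   # shape: (num_samples, num_labels)
# The model's output layer should then have len(label_cols) sigmoid units,
# trained with a binary_crossentropy loss, for multi-label prediction.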
I am trying to implement the logic behind k-fold cross-validation on a test matrix, without using a library. Somehow, my rotated matrices are not working correctly.
I have taken k to be 5.
X = np.matrix([[1,2,3,4,5],[7,8,9,4,5],[4,9,6,4,2],[9,5,1,2,3],[7,5,3,4,6]])
P = np.ones((5,5))
target = np.matrix([[1,2,3,4,5]]).T
#def k_fold(X,target,k):
r = X.shape[0]
k=5
step = r//k
last_row_train = step*(k-1)
for i in range(5):
    X_train = X[0:last_row_train,:]
    tempX = X_train
    X_test = X[last_row_train:r,:]
    temp_X_test = X_test
    t_train = target[0:last_row_train,:]
    temp_t_train = t_train
    t_test = target[last_row_train:r,:]
    temp_test = t_test
    X[step:r,:] = tempX    # On running this line, it changes the value of
                           # temp_X_test, which is very weird and not
                           # supposed to happen
    X[0:step,:] = temp_X_test
    target[0:step,:] = temp_test
    target[step:r,:] = temp_t_train
    print(X)
    print(target)
tempX = X_train
This statement does not create a new variable tempX holding a copy of X_train. It makes both names, tempX and X_train, point to the same object, so any in-place change made through one of them is visible through the other. The same thing happens with the slices: basic slicing such as X[last_row_train:r,:] returns a view that shares memory with X, so temp_X_test still points into X, and writing into X with X[step:r,:] = tempX therefore changes temp_X_test as well. This is a recurring problem in your code.
When you need an independent copy instead, use the code below.
tempX = X_train.copy()
Here's a link to a similar question with more solutions.
How to clone or copy a list?
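For the fold rotation itself, it is usually simpler to avoid in-place writes into X altogether and select each fold by index, copying the slices you keep. A small sketch with the same 5x5 example (using a plain np.array rather than np.matrix):
import numpy as np

X = np.array([[1,2,3,4,5],[7,8,9,4,5],[4,9,6,4,2],[9,5,1,2,3],[7,5,3,4,6]])
target = np.array([[1,2,3,4,5]]).T

k = 5
r = X.shape[0]
step = r // k
for i in range(k):
    test_idx = np.arange(i * step, (i + 1) * step)     # rows held out in this fold
    train_idx = np.setdiff1d(np.arange(r), test_idx)   # remaining rows
    X_train, X_test = X[train_idx].copy(), X[test_idx].copy()
    t_train, t_test = target[train_idx].copy(), target[test_idx].copy()
    # ... fit on (X_train, t_train), evaluate on (X_test, t_test) ...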