Keras - Theano - Test for division by zero - theano

I have a Layer that computes the mean over timesteps and supports masking.
My problem is that there may be cases where the mask is empty (no padded timesteps), but I don't know how to check for zeros when working with tensors.
I have a few training examples for which the mask is empty, so I get a NaN loss and the program crashes.
This is my Layer:
class MeanOverTime(Layer):
    """Average the input over axis 1, optionally honouring a mask.

    With a mask, each sample's mean is taken over its unmasked steps only.
    A sample whose mask is entirely zero yields zeros instead of NaN.
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(MeanOverTime, self).__init__(**kwargs)

    def call(self, x, mask=None):
        """Return the (masked) mean of ``x`` over axis 1.

        Assumes ``x`` is (batch, steps, features) and ``mask`` is
        (batch, steps) -- TODO confirm against the layers feeding this one.
        """
        if mask is None:
            return K.mean(x, axis=1)

        # Float mask: avoids integer division and float64 upcasting.
        weights = K.cast(mask, K.floatx())
        # Number of unmasked steps per sample, shape (batch, 1).
        steps = K.sum(weights, axis=1, keepdims=True)
        # try/except cannot guard a symbolic division, so the floor has to be
        # part of the graph: an all-masked row divides by epsilon, not by 0.
        steps = K.maximum(steps, K.epsilon())
        # Zero out masked steps so they do not leak into the numerator; an
        # all-masked row then evaluates to 0 / epsilon == 0.
        total = K.sum(x * K.expand_dims(weights), axis=1)
        return K.cast(total / steps, K.floatx())

    def get_output_shape_for(self, input_shape):
        # The reduced axis (1) disappears; first and last dimension remain.
        return input_shape[0], input_shape[-1]

    def compute_mask(self, input, input_mask=None):
        # The time axis is gone after this layer, so no mask is propagated.
        return None
The problem occurs when mask.sum(axis=1, keepdims=True) becomes zero. To bypass this I have increased the input_length so that it covers all my training examples, but this is not a real solution. I also tried adding a try/except, but that didn't work either.

try/except won't work, because all this piece of code does is build the symbolic tensor graph, which raises no exception. The evaluation, and hence the division by zero, happens later inside the fit/evaluate/predict functions. You need to include the logic/decision in the symbolic graph itself.
You can use switch(condition, then_expression, else_expression) to include if and else:
def call(self, x, mask=None):
    """Return the mean of ``x`` over axis 1, guarding the masked case with ``K.switch``.

    The zero check is expressed inside the symbolic graph: wherever the
    per-sample mask sum is 0 the divisor is replaced by 1.
    """
    if mask is None:
        return K.mean(x, axis=1)

    # Number of unmasked steps per sample (named ``steps`` so the builtin
    # ``sum`` is not shadowed).
    steps = mask.sum(axis=1, keepdims=True)
    # Element-wise: use 1 where the count is 0, the count itself elsewhere.
    safe_steps = K.switch(K.equal(steps, 0), K.ones_like(steps), steps)
    return K.cast(x.sum(axis=1) / safe_steps, K.floatx())
Or just use clip(x, min_value, max_value) to clip with a very small number epsilon to make the division numerically stable.
def call(self, x, mask=None):
    """Return the mean of ``x`` over axis 1, clipping the divisor away from zero."""
    if mask is None:
        return K.mean(x, axis=1)

    # Cast first so the count is a float and can be clipped to a fractional
    # lower bound.
    steps = K.sum(K.cast(mask, K.floatx()), axis=1, keepdims=True)
    # Lower bound only: ``K.epsilon`` must be *called*, and the upper bound is
    # left open (an upper bound of 1 would cap every count at 1 and turn the
    # mean into a plain sum).
    safe_steps = K.clip(steps, K.epsilon(), float("inf"))
    return K.cast(K.sum(x, axis=1) / safe_steps, K.floatx())

Related

How to load multiple .mat files each with its own sequence in pytorch

I have multiple files in my directory whose names begin with either P or C. I am trying to train an RNN to predict the values of C given a sequence of P.
Each file holds one signal. I break the signal into smaller parts, each with dimension (sequence length, 1), as there is only one feature. Ideally my output dimension should be something like (num_batches, batch_size, seq_length, features). However, as I have multiple files, I get something like (num_files, num_batches, batch_size, seq_length, features).
Here's my code
class MyDataset(Dataset):
    """One item per pair of ``P*.mat`` / ``C*.mat`` files, cut into sliding windows.

    ``__getitem__`` loads the ``index``-th P file (variable ``P_noise``) and
    the ``index``-th C file (variable ``Cm``) and returns every window of
    ``seq_length`` consecutive P samples together with the C sample aligned
    with the last step of that window.
    """

    def __init__(self, PATH, seq_length):
        """Collect the ``.mat`` files found directly inside ``PATH``.

        Args:
            PATH: directory to scan (not recursive).
            seq_length: number of samples per input window.
        """
        self.seq_length = seq_length
        self.c_paths = []
        self.p_paths = []
        for entry in os.scandir(PATH):
            parts = entry.name.split('.')
            if parts[-1] != 'mat':
                continue
            file_name = parts[0]
            # Match on the leading letter: a substring test would put a name
            # containing both letters into both lists.
            if file_name.startswith('C'):
                self.c_paths.append(entry.path)
            elif file_name.startswith('P'):
                self.p_paths.append(entry.path)
        # os.scandir yields entries in arbitrary order; sorting both lists
        # keeps the i-th P file paired with the i-th C file.
        # NOTE(review): assumes P and C names share the same suffix scheme.
        self.c_paths.sort()
        self.p_paths.sort()

    def __getitem__(self, index):
        """Return ``(inputs, outputs)`` tensors for the ``index``-th file pair.

        ``inputs`` is reshaped to (-1, seq_length, 1) and ``outputs`` to
        (-1, 1). Both are also stored on ``self.x`` / ``self.y``.
        """
        p_noise = sio.loadmat(self.p_paths[index])['P_noise']
        cm = sio.loadmat(self.c_paths[index])['Cm']

        length = self.seq_length
        starts = range(len(p_noise) - length)
        windows = [p_noise[start:start + length] for start in starts]
        # Target is the C sample at the last position covered by the window.
        targets = [cm[start + length - 1] for start in starts]

        self.x = torch.from_numpy(np.array(windows).reshape((-1, length, 1)))
        self.y = torch.from_numpy(np.array(targets).reshape((-1, 1)))
        return self.x, self.y

    def __len__(self):
        return len(self.c_paths)
Heres the output
# Build the dataset, wrap it in a DataLoader and inspect one batch.
PATH = 'Dataset'
dataset = MyDataset(PATH, seq_length=400)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=False)
datatiter = iter(dataloader)
# The builtin next() works on any iterator; the .next() method is the
# Python 2 spelling.
data = next(datatiter)
x, y = data
x.shape, y.shape

tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [100,200] vs. [100,10,200]

The shape of the tensor input to my model is (None, 10, 256); after processing by the attention layer, the shape becomes (None, 256). How should I modify the layer's compute_output_shape(self, input_shape) so that the shape of the model does not change?
attention layer
class Attention_layer(Layer):
    """Attention pooling over axis 1 of a 3-D input.

    Per-feature scores ``exp(tanh(x . W + b))`` are normalised over axis 1
    and used to weight ``x`` before summing over that axis, so a
    (batch, steps, features) input becomes (batch, features).
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        super(Attention_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the square weight matrix and, if enabled, the bias vector."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]
        self.W = self.add_weight(name='att_weight',
                                 shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # Keyword arguments throughout: a positional shape would land in
            # add_weight's first parameter (the name) and clash with ``name=``.
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(feature_dim,),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        super(Attention_layer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        uit = K.dot(x, self.W)
        if self.bias:
            uit += self.b
        uit = K.tanh(uit)
        a = K.exp(uit)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano.
            # The extra trailing axis lets the mask broadcast against the 3-D
            # scores (assumes the mask is (batch, steps) -- TODO confirm).
            a *= K.expand_dims(K.cast(mask, K.floatx()))

        # in some cases especially in the early stages of training the sum may
        # be almost zero and this results in NaN's. A workaround is to add a
        # very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * a
        # Summing over axis 1 removes it: output is (batch_size, embedding_size).
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

Regression tree - SSR Impurity measure

I have the code of a decision tree for classification, and
I'm trying to convert it into a regression tree.
I understand that I need to change the impurity measure.
In classification I have Gini and entropy.
In regression I need to use SSR (the sum of squared residuals).
If I understand correctly, I need to change the information_gain function so that it calculates the SSR.
Can someone help me understand how I should change it?
class DecisionTreeClassifier():
    ''' Decision tree classifier built from binary "feature <= threshold" splits.

    Tree nodes are instances of a ``Node`` class that is defined outside this
    block. It is called as ``Node(feature_index, threshold, left, right,
    info_gain)`` for decision nodes and ``Node(value=...)`` for leaves.
    '''

    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        # the root of the tree
        self.root = None
        # stopping conditions: a node with fewer samples than
        # min_samples_split, or deeper than max_depth, becomes a leaf
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree

        dataset is a 2-D array whose last column is the target.
        Returns the root Node of the (sub)tree built from dataset.
        '''
        # splitting the features and target
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # a gain of 0 means the split does not reduce impurity
            # NOTE: best_split is empty when no threshold yields two non-empty
            # children; the lookup below then raises KeyError, as before.
            if best_split["info_gain"] > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # stopping condition hit (or no useful split): emit a leaf
        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split

        Tries every unique value of every feature as a threshold and returns a
        dict with keys "feature_index", "threshold", "dataset_left",
        "dataset_right" and "info_gain" for the split with the largest gain.
        The dict is empty when no candidate produces two non-empty children.
        '''
        best_split = {}
        # start below any achievable gain so the first valid split is accepted
        max_info_gain = -float("inf")
        # parent targets do not depend on the candidate split
        y = dataset[:, -1]

        for feature_index in range(num_features):
            possible_thresholds = np.unique(dataset[:, feature_index])
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # skip degenerate splits that leave one side empty
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    if curr_info_gain > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain

        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data

        Rows with feature <= threshold go left, rows with feature > threshold
        go right.
        '''
        column = dataset[:, feature_index]
        # boolean masks select the rows in one vectorised pass; the two
        # conditions are written out separately so a row that satisfies
        # neither (e.g. NaN) is still left out of both sides
        dataset_left = dataset[column <= threshold]
        dataset_right = dataset[column > threshold]
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode="gini"):
        ''' function to compute information gain

        Gain = impurity(parent) - size-weighted impurity of the two children.
        mode "gini" uses the Gini index; any other value uses entropy.
        '''
        # weights are the children's share of the parent's samples
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        impurity = self.gini_index if mode == "gini" else self.entropy
        return impurity(parent) - (weight_l * impurity(l_child) + weight_r * impurity(r_child))

    def entropy(self, y):
        ''' function to compute entropy '''
        # kept callable because information_gain falls back to it for every
        # mode other than "gini"
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy

    def gini_index(self, y):
        ''' function to compute gini index '''
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        from collections import Counter

        # most frequent label; on ties the label seen first wins, which is
        # what max(Y, key=Y.count) returned, without the quadratic counting
        return Counter(list(Y)).most_common(1)[0][0]

    def print_tree(self, tree=None, indent=" "):
        ''' recursive function to print the tree '''
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)
        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree '''
        # Y must be 2-D (n_samples, 1) so it can be appended as the last column
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset '''
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        # only leaf nodes carry a value
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        return self.make_prediction(x, tree.right)

Has anyone written weldon pooling for keras?

Has the Weldon pooling [1] been implemented in Keras?
I can see that it has been implemented in pytorch by the authors [2] but cannot find a keras equivalent.
[1] T. Durand, N. Thome, and M. Cord. Weldon: Weakly su-
pervised learning of deep convolutional neural networks. In
CVPR, 2016.
[2] https://github.com/durandtibo/weldon.resnet.pytorch/tree/master/weldon
Here is one based on the Lua version (there is a PyTorch implementation, but I think it has an error in how it takes the average of max + min). I'm assuming the Lua version's average of the top max and min values is still correct. I've not tested all the custom-layer aspects, but it is close enough to get something going; comments welcome.
Tony
class WeldonPooling(Layer):
    """Class to implement Weldon selective spatial pooling with negative evidence.

    For every channel the spatial activations are sorted; the output is the
    mean of the ``kmax`` largest values plus, when ``kmin > 0``, the mean of
    the ``kmin`` smallest values.
    """
    ##interfaces.legacy_global_pooling_support
    def __init__(self, kmax, kmin=-1, data_format=None, **kwargs):
        super(WeldonPooling, self).__init__(**kwargs)
        self.data_format = conv_utils.normalize_data_format(data_format)
        self.input_spec = InputSpec(ndim=4)
        self.kmax = kmax
        self.kmin = kmin

    def compute_output_shape(self, input_shape):
        # Both spatial axes are pooled away; batch and channel axes remain.
        if self.data_format == 'channels_last':
            return (input_shape[0], input_shape[3])
        else:
            return (input_shape[0], input_shape[1])

    def get_config(self):
        # kmax/kmin are constructor arguments, so they are part of the config.
        config = {'kmax': self.kmax,
                  'kmin': self.kmin,
                  'data_format': self.data_format}
        # super() must name this class; _GlobalPooling2D is not defined here.
        base_config = super(WeldonPooling, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, inputs):
        """Pool a 4-D tensor down to (batch, channels)."""
        if self.data_format == "channels_last":
            # Work in (batch, channels, h, w) from here on.
            inputs = tf.transpose(inputs, [0, 3, 1, 2])
        kmax = self.kmax
        kmin = self.kmin
        shape = tf.shape(inputs)
        batch_size = shape[0]
        num_channels = shape[1]
        h = shape[2]
        w = shape[3]
        n = h * w
        view = tf.reshape(inputs, [batch_size, num_channels, n])
        # top_k with k == n is a full descending sort along the last axis.
        ranked, _ = tf.nn.top_k(view, n, sorted=True)
        top = tf.slice(ranked, [0, 0, 0], [batch_size, num_channels, kmax])
        # Plain "/" in place of tf.div, which is not available in TF 2.
        output = tf.reduce_sum(top, 2) / kmax
        if kmin > 0:
            bottom = tf.slice(ranked, [0, 0, n - kmin], [batch_size, num_channels, kmin])
            output = tf.add(output, tf.reduce_sum(bottom, 2) / kmin)
        return tf.reshape(output, [batch_size, num_channels])

setting up MCMC with log-likelihood and log-normal prior with PyMC

I am a newbie with PyMC and I am still not able to construct the structure of my MCMC with PyMC. I would like to set up a chain, and I am confused about how to define my parameters and log-likelihood function together. My chi-squared function is given by:
$$\chi^2 = \sum_i \frac{\left[d_i - m_i(X, Y, M, C)\right]^2}{\sigma_i^2}$$
where $d_i$ and $\sigma_i$ are the observational data and the corresponding errors, respectively, and $m_i(X, Y, M, C)$ is the model with four free parameters; the model is non-linear in these parameters.
The priors for X and Y are uniform, like this:
import pymc as pm
import numpy as np
import math
import random
#pm.stochastic(dtype=np.float, observed=False, trace=True)
# NOTE(review): the line above looks like a decorator whose "@" was lost --
# confirm before relying on this function as a PyMC stochastic.
def Xpos(value=1900,x_l=1851,x_h=1962):
    """The probable region of the position of halo centre"""
    def logp(value,x_l,x_h):
        # Flat log-density over [x_l, x_h]; -inf outside the range.
        if ((value>x_h) or (value<x_l)):
            return -np.inf
        else:
            return -np.log(x_h-x_l+1)
    def random(x_l,x_h):
        # np.random instead of the stdlib module: inside this scope the bare
        # name ``random`` is this function itself, so ``random.random()``
        # would raise AttributeError.
        return np.round((x_h-x_l)*np.random.random())+x_l
#pm.stochastic(dtype=np.float, observed=False, trace=True)
# NOTE(review): the line above looks like a decorator whose "@" was lost --
# confirm before relying on this function as a PyMC stochastic.
def Ypos(value=1900,y_l=1851,y_h=1962):
    """The probable region of the position of halo centre"""
    def logp(value,y_l,y_h):
        # Flat log-density over [y_l, y_h]; -inf outside the range.
        if ((value>y_h) or (value<y_l)):
            return -np.inf
        else:
            return -np.log(y_h-y_l+1)
    def random(y_l,y_h):
        # np.random instead of the stdlib module: inside this scope the bare
        # name ``random`` is this function itself, so ``random.random()``
        # would raise AttributeError.
        return np.round((y_h-y_l)*np.random.random())+y_l
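but the priors for M and C are given as follows (reconstructed from the code below): M follows an exponential distribution with scale $10^{15}$,
$$P(M) \propto \exp\left(-\frac{M}{10^{15}}\right),$$
and C follows a log-normal distribution with mean $\mu_c$ and width $\sigma_c = 0.09$ for $M < 10^{15}$ and $\sigma_c = 0.06$ otherwise,
where the mean of C is computed via
$$\mu_c = \frac{\int P(M)\, c(M)\, dM}{\int P(M)\, dM}, \qquad c(M) = \frac{5.26}{1+z}\left(\frac{M}{10^{14}}\right)^{-0.1}.$$
For M and C, the priors should look like this: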
# Mass: 1e15 times an Exponential stochastic named 'mass' with beta = 1e15.
M = 1e15 * pm.Exponential('mass', beta=1e15)
#pm.stochastic(dtype=np.float, observed=False, trace=True)
# NOTE(review): the line above looks like a decorator whose "@" was lost --
# confirm before relying on this function as a PyMC stochastic.
def concentration(value=4, zh=None, M200=None):
    """logp for concentration parameter

    ``zh`` and ``M200`` now carry defaults because Python does not allow
    non-default parameters after ``value=4``; pass the real values explicitly.
    """
    def logp(value, zh, M200):
        # Local import: ``expon`` is not imported anywhere in the visible file.
        from scipy.stats import expon

        if value <= 0:
            return -np.inf

        # Mass grid and exponential weights used to average c(M).
        x = np.linspace(math.pow(10, 13), math.pow(10, 16), 200)
        prob = expon.pdf(x, loc=0, scale=math.pow(10, 15))
        conc = 5.26 / (1. + zh) * (x / math.pow(10, 14)) ** (-0.1)
        # Weighted mean of conc; the normalising sum is taken once instead of
        # once per grid point.
        mu_c = np.dot(prob, conc) / prob.sum()

        if (M200 < pow(10, 15)):
            tau = 1. / (0.09 * 0.09)
        else:
            tau = 1. / (0.06 * 0.06)
        return pm.lognormal_like(value, mu_c, tau)

    def random(mu_c, tau):
        # NOTE(review): np.random.lognormal takes (mean, sigma, size), while
        # ``tau`` in logp is a precision -- confirm which is intended here.
        return np.random.lognormal(mu_c, tau, 1)
The parameter z is also a constant in the prior for C. I am wondering how I could define my likelihood for $\chi^2$, and whether it should be declared as a deterministic variable. Have I defined M and C as prior information correctly or not?
I would be grateful if somebody could give me some tips on how to combine these parameters with the given priors.
#priors
#pm.stochastic(dtype=np.float, observed=False, trace=True)
def Xpos(value=1900,x_l=1800,x_h=1950):
    """Log-probability of a halo-centre x position under a flat prior.

    Returns ``-inf`` when ``value`` lies outside ``[x_l, x_h]`` and
    ``-log(x_h - x_l + 1)`` otherwise.
    """
    out_of_range = (value > x_h) or (value < x_l)
    if out_of_range:
        return -np.inf
    width = x_h - x_l + 1
    return -np.log(width)
#pm.stochastic(dtype=np.float, observed=False, trace=True)
def Ypos(value=1750,y_l=1200,y_h=2000):
    """Flat prior on the halo-centre y position.

    Only defines the nested ``logp``; the outer function itself returns None.
    """
    def logp(value,y_l,y_h):
        # -inf outside [y_l, y_h], constant log-density inside.
        out_of_range = (value > y_h) or (value < y_l)
        if out_of_range:
            return -np.inf
        width = y_h - y_l + 1
        return -np.log(width)
# Mass: 1e15 times an Exponential stochastic named 'mass' with beta = 1e15.
M = 1e15 * pm.Exponential('mass', beta=1e15)
#deterministic
def sigma(value = 1, M=M):
    """Return 0.09 when ``M`` is below 10**15 and 0.06 otherwise.

    ``value`` is accepted but not used.
    """
    return .09 if M < 10**15 else .06
cExpected = 5.26/(1+z)*(M/math.pow(10,14))**(-.1) # based on Neto et al. 2007
# Qualified with ``pm.`` like the other distributions in this snippet.
# The third argument of Lognormal is a precision, so the standard deviation
# returned by ``sigma`` is converted with tau = 1 / sigma**2.
# NOTE(review): assumes ``sigma`` is a PyMC deterministic node (arithmetic on
# a plain function fails) and that cExpected is meant as ``mu`` -- confirm.
concentration = pm.Lognormal("concentration", cExpected, 1. / sigma ** 2)
#model
#pm.deterministic( name='reduced_shear', dtype=np.float, observed=False, trace = True )
def reduced_shear(x=Xpos,y=Ypos,mass=M,conc=concentration):
    """Return the redshift-weighted tangential shear for a halo centred at (x, y).

    Relies on ``NFWHalo``, ``z``, ``redshift_pdf``, ``gal_pos``, ``gal_xpos``
    and ``gal_ypos`` being defined outside this block.
    """
    # A positional argument may not follow a keyword one, so the redshift is
    # passed positionally.
    # NOTE(review): assumes NFWHalo's parameters are (mass, conc, zh, centre)
    # in that order -- confirm against its definition.
    nfw = NFWHalo(mass, conc, 0.128, [x, y])
    # Normalising constant of the redshift weights, computed once.
    pdf_total = sum(redshift_pdf)
    g1tot = 0
    g2tot = 0
    for i, z_i in enumerate(z):
        g1, g2, magnification = nfw.getLensing(gal_pos, z_i)
        g1tot += g1 * redshift_pdf[i] / pdf_total
        g2tot += g2 * redshift_pdf[i] / pdf_total
    # Use the arguments x and y rather than the module-level Xpos/Ypos.
    theta = np.arctan2(gal_ypos - y, gal_xpos - x)
    value = -g1tot * np.cos(2 * theta) - g2tot * np.sin(2 * theta)  # tangential shear
    return value
#pm.deterministic( name='tau_shear', dtype=np.float, observed=False, trace = True )
def tau_shear(Xpos,Ypos,M,concentration):
    """Return ``(1 - |g|^2) * |delta_e|`` from the redshift-weighted shear.

    Relies on ``NFWHalo``, ``z``, ``redshift_pdf``, ``gal_pos``, ``delta_e1``
    and ``delta_e2`` being defined outside this block.
    """
    # A positional argument may not follow a keyword one, so the redshift is
    # passed positionally.
    # NOTE(review): assumes NFWHalo's parameters are (mass, conc, zh, centre)
    # in that order -- confirm against its definition.
    nfw = NFWHalo(M, concentration, 0.128, [Xpos, Ypos])
    # Normalising constant of the redshift weights, computed once.
    pdf_total = sum(redshift_pdf)
    g1tot = 0
    g2tot = 0
    for i, z_i in enumerate(z):
        g1, g2, magnification = nfw.getLensing(gal_pos, z_i)
        g1tot += g1 * redshift_pdf[i] / pdf_total
        g2tot += g2 * redshift_pdf[i] / pdf_total
    g_squared = g1tot**2 + g2tot**2
    # NOTE(review): the original read ``delta_e1**2+delta_e1**22``; taken to
    # be a typo for the quadrature sum of both components -- confirm that
    # ``delta_e2`` exists.
    delta_abse = np.sqrt(delta_e1**2 + delta_e2**2)
    value = (1 - g_squared) * delta_abse
    return value
tau = pm.Normal('tau', tau_shear, 0.2)
#likelihood
# ``tau`` is passed by keyword: a positional argument may not follow ``mu=``.
obs = pm.Normal("obs", mu=reduced_shear, tau=tau, value=data, observed=True)

Resources