How to combine scikit-learn's GroupKFold and StratifiedKFold

I am working with an imbalanced data set that has multiple observations from the same set of users. I want to make sure that the same users don't appear in both the training and test sets, while still preserving the original label distribution as much as possible.
I have been trying to combine the GroupKFold and StratifiedKFold classes from scikit-learn, but I'm at a loss as to how to do it. Does anyone have any ideas on how I could combine these two?

import collections
import random

import numpy as np

def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Source: https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation"""
    labels_num = np.max(y) + 1
    y_counts_per_group = collections.defaultdict(lambda: np.zeros(labels_num))
    y_distr = collections.Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = collections.defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = collections.defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]
        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]
        yield train_indices, test_indices
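As a side note, recent scikit-learn releases (1.0+) ship sklearn.model_selection.StratifiedGroupKFold, which addresses exactly this use case. If you use the generator above, here is a minimal usage sketch with synthetic data (the X, y, and groups below are made up purely for illustration):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.choice([0, 0, 0, 1], size=100)   # imbalanced labels
groups = rng.randint(0, 20, size=100)    # 20 "users"

for fold, (train_idx, test_idx) in enumerate(
        stratified_group_k_fold(X, y, groups, k=5, seed=42)):
    # no user appears in both the training and test sets
    assert set(groups[train_idx]).isdisjoint(groups[test_idx])
    print(fold, np.bincount(y[train_idx]), np.bincount(y[test_idx]))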

Related

How to remove in-place operation error in PyTorch?

I get this error from the following PyTorch code:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [3]] is at version 10; expected version 9 instead.
As far as I can see, the code does not have any in-place operations.
import torch

device = torch.device('cpu')

class MesNet(torch.nn.Module):
    def __init__(self):
        super(MesNet, self).__init__()
        self.cov_lin = torch.nn.Sequential(torch.nn.Linear(6, 5)).double()

    def forward(self, u):
        z_cov = self.cov_lin(u.transpose(0, 2).squeeze(-1))
        return z_cov

class UpdateModel(torch.nn.Module):
    def __init__(self):
        torch.nn.Module.__init__(self)
        self.P_dim = 18
        self.Id3 = torch.eye(3).double()

    def run_KF(self):
        N = 10
        u = torch.randn(N, 6).double()
        v = torch.zeros(N, 3).double()
        model = MesNet()
        measurements_covs_l = model(u.t().unsqueeze(0))
        # remember to remove this afterwards
        torch.autograd.set_detect_anomaly(True)
        for i in range(1, N):
            v[i] = self.update_pos(v[i].detach(), measurements_covs_l[i-1])
        criterion = torch.nn.MSELoss(reduction="sum")
        targ = torch.rand(10, 3).double()
        loss = criterion(v, targ)
        loss = torch.mean(loss)
        loss.backward()
        return v, p

    def update_pos(self, v, measurement_cov):
        Omega = torch.eye(3).double()
        H = torch.ones((5, self.P_dim)).double()
        R = torch.diag(measurement_cov)
        Kt = H.t().mm(torch.inverse(R))
        # it is indicating inplace error even with this:
        # Kt = H.t().mm(R)
        dx = Kt.mv(torch.ones(5).double())
        dR = self.trans(dx[:9].clone())
        v_up = dR.mv(v)
        return v_up

    def trans(self, xi):
        phi = xi[:3].clone()
        angle = torch.norm(phi.clone())
        if angle.abs().lt(1e-10):
            skew_phi = torch.eye(3).double()
            J = self.Id3 + 0.5 * skew_phi
            Rot = self.Id3 + skew_phi
        else:
            axis = phi / angle
            skew_axis = torch.eye(3).double()
            s = torch.sin(angle)
            c = torch.cos(angle)
            Rot = c * self.Id3
        return Rot

net = UpdateModel()
net.run_KF()
I think the issue is that you are overwriting the elements of v in place. You could instead collect the results in an auxiliary list v_ inside the loop, then convert it to a tensor:
v_ = [v[0]]
for i in range(1, N):
    v_.append(self.update_pos(v[i].detach(), measurements_covs_l[i-1]))
v = torch.stack(v_)
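For context, the same class of error can be reproduced in isolation. This hypothetical snippet is unrelated to the model above, but shows why an in-place write on a tensor that autograd saved for the backward pass fails:

import torch

a = torch.rand(3, requires_grad=True)
b = a.exp()         # exp() saves its output for the backward pass
b[0] = 0.0          # in-place write bumps b's version counter
b.sum().backward()  # RuntimeError: ... modified by an inplace operation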

Convert TensorFlow 1 code to TensorFlow 2

There is code written with TensorFlow 1 at this link:
https://github.com/carlthome/tensorflow-convlstm-cell/blob/master/cell.py
I want to use this class as a layer in tf.keras, so it needs to be rewritten for TensorFlow 2. How can I do that?
This is the code:
import tensorflow as tf

class ConvLSTMCell(tf.nn.rnn_cell.RNNCell):
    """A LSTM cell with convolutions instead of multiplications.

    Reference:
      Xingjian, S. H. I., et al. "Convolutional LSTM network: A machine
      learning approach for precipitation nowcasting." Advances in Neural
      Information Processing Systems. 2015.
    """

    def __init__(self, shape, filters, kernel, forget_bias=1.0,
                 activation=tf.tanh, normalize=True, peephole=True,
                 data_format='channels_last', reuse=None):
        super(ConvLSTMCell, self).__init__(_reuse=reuse)
        self._kernel = kernel
        self._filters = filters
        self._forget_bias = forget_bias
        self._activation = activation
        self._normalize = normalize
        self._peephole = peephole
        if data_format == 'channels_last':
            self._size = tf.TensorShape(shape + [self._filters])
            self._feature_axis = self._size.ndims
            self._data_format = None
        elif data_format == 'channels_first':
            self._size = tf.TensorShape([self._filters] + shape)
            self._feature_axis = 0
            self._data_format = 'NC'
        else:
            raise ValueError('Unknown data_format')

    @property
    def state_size(self):
        return tf.nn.rnn_cell.LSTMStateTuple(self._size, self._size)

    @property
    def output_size(self):
        return self._size

    def call(self, x, state):
        c, h = state
        x = tf.concat([x, h], axis=self._feature_axis)
        n = x.shape[-1].value
        m = 4 * self._filters if self._filters > 1 else 4
        W = tf.get_variable('kernel', self._kernel + [n, m])
        y = tf.nn.convolution(x, W, 'SAME', data_format=self._data_format)
        if not self._normalize:
            y += tf.get_variable('bias', [m], initializer=tf.zeros_initializer())
        j, i, f, o = tf.split(y, 4, axis=self._feature_axis)
        if self._peephole:
            i += tf.get_variable('W_ci', c.shape[1:]) * c
            f += tf.get_variable('W_cf', c.shape[1:]) * c
        if self._normalize:
            j = tf.contrib.layers.layer_norm(j)
            i = tf.contrib.layers.layer_norm(i)
            f = tf.contrib.layers.layer_norm(f)
        f = tf.sigmoid(f + self._forget_bias)
        i = tf.sigmoid(i)
        c = c * f + i * self._activation(j)
        if self._peephole:
            o += tf.get_variable('W_co', c.shape[1:]) * c
        if self._normalize:
            o = tf.contrib.layers.layer_norm(o)
            c = tf.contrib.layers.layer_norm(c)
        o = tf.sigmoid(o)
        h = o * self._activation(c)
        state = tf.nn.rnn_cell.LSTMStateTuple(c, h)
        return h, state
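One option, if you do not need the peephole and layer-normalization extras of the cell above, is the built-in tf.keras.layers.ConvLSTM2D layer in TensorFlow 2. A minimal sketch (the shapes below are made up for illustration):

import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None, 32, 32, 3)),  # (time, H, W, C)
    tf.keras.layers.ConvLSTM2D(filters=16, kernel_size=(3, 3),
                               padding='same', return_sequences=True),
])
x = tf.random.normal([2, 5, 32, 32, 3])  # (batch, time, H, W, C)
print(model(x).shape)                    # (2, 5, 32, 32, 16)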

Why do I get this error: UnboundLocalError: local variable 'top_performer' referenced before assignment?

Here is my Python code. I don't understand why I am getting the following error. Any guidance or help would be much appreciated.
UnboundLocalError: local variable 'top_performer' referenced before assignment
def create(X, y, **kwargs):
    method = kwargs.get("method", None)
    #method = kwargs.get("method", "Binary_operators")
    #method = kwargs.get("method", "Binning")
    #method = kwargs.pop("method", "Cluster")
    #categorical_cols = [c for c, t in zip(X.columns, X_column_types) if t in [DATATYPE_CATEGORY_INT, DATATYPE_CATEGORY_STRING]]
    #numerical_cols = [c for c, t in zip(X.columns, X_column_types) if t == DATATYPE_NUMBER]
    #categorical = X[categorical_cols]
    #numerical = X[numerical_cols]
    categorical = X.select_dtypes(include=[object])
    numerical = X.select_dtypes(exclude=[object])

    # feature selection using Genetic Algorithm
    if method == "fs_GA":
        print("fs_GA")
        enc = OneHotEncoder()
        enc.fit(categorical)
        Data_cat = pd.DataFrame(enc.transform(categorical).toarray())
        X_data = pd.concat([numerical, Data_cat], axis=1)

        if y.dtype == int:
            y = y
        else:
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
            le.fit(y)
            y = le.transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X_data, y, train_size=0.8, random_state=42)

        def get_fitness(individual):
            if y.dtype == int:
                rg = RandomForestRegressor(random_state=42)
            else:
                rg = RandomForestClassifier(random_state=42)
            columns = [column for (column, binary_value) in zip(X_train.columns, individual) if binary_value]
            training_set = X_train[columns]
            test_set = X_test[columns]
            rg.fit(training_set.values, y_train)
            preds = rg.predict(test_set.values)
            return 100 / np.sqrt(mean_squared_error(y_test, preds))

        individual = [1] * 100
        get_fitness(individual)

        def get_population_fitness(population):
            return sorted([(individual, get_fitness(individual)) for individual in population], key=lambda tup: tup[1], reverse=True)

        def crossover(individual_a, individual_b):
            crossing_point = random.randint(0, 99)
            offspring_a = individual_a[0:crossing_point] + individual_b[crossing_point:100]
            offspring_b = individual_b[0:crossing_point] + individual_a[crossing_point:100]
            return offspring_a, offspring_b

        def tournament(current_population):
            index = sorted(random.sample(range(0, 20), 5))
            tournament_members = [current_population[i] for i in index]
            total_fitness = sum([individual[1] for individual in tournament_members])
            probabilities = [individual[1] / total_fitness for individual in tournament_members]
            index_a, index_b = np.random.choice(5, size=2, p=probabilities)
            return crossover(tournament_members[index_a][0], tournament_members[index_b][0])

        def mutation(individual):
            mutation_point = random.randint(0, 99)
            if individual[mutation_point]:
                individual[mutation_point] = 0
            else:
                individual[mutation_point] = 1

        def build_next_generation(current_population, mutation_rate):
            next_generation = []
            next_generation.append(current_population[0][0])  # elitism
            next_generation.append(current_population[random.randint(1, 19)][0])  # randomness
            for i in range(9):  # tournaments
                offspring_a, offspring_b = tournament(current_population)
                next_generation.append(offspring_a)
                next_generation.append(offspring_b)
            for individual in next_generation:  # mutation
                if random.randint(1, mutation_rate) == 1:
                    mutation(individual)
            return next_generation

        def run_ga(current_population, num_of_generations, mutation_rate=1000):
            fittest_individuals = []
            for i in range(num_of_generations):
                current_population = get_population_fitness(current_population)  # get pop fitness
                fittest_individuals.append(current_population[0])  # record fittest individual (for graphing and analysis)
                current_population = build_next_generation(current_population, mutation_rate)  # make new population
            return fittest_individuals

        initial_population = [[random.randint(0, 1) for i in range(100)] for i in range(20)]
        high_mutation_fittest = run_ga(initial_population, 100, mutation_rate=5)
        high_mutation_fitness = [ind[1] for ind in high_mutation_fittest]
        for item in high_mutation_fittest[:-1]:
            if item[1] == max(high_mutation_fitness):
                top_performer = item
                break
        print("Total features included: " + str(top_performer[0].count(1)))
        selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]
        excluded_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if not binary_value]
        X = X[selected_features]

    if method == "Binary_operators":
        print("binaryoperators")
    if method == "Binning":
        print("binning")
    else:
        print("Discretization")
    if method == "Cluster":
        print("clustering")
    else:
        print("no-cluster")
    print("normal_autocross")
When I run the code I get the following error, and I don't understand what it means. Can someone please explain why I'm getting it?
create(X, y, method="fs_GA")
fs_GA
UnboundLocalError                         Traceback (most recent call last)
in
----> 1 create(X, y, method="fs_GA")

in create(X, y, **kwargs)
    107                 top_performer = item
    108                 break
--> 109         print("Total features included: " + str(top_performer[0].count(1)))
    110
    111         selected_features = [column for (column, binary_value) in zip(X.columns, top_performer[0]) if binary_value]

UnboundLocalError: local variable 'top_performer' referenced before assignment
top_performer = 0
for item in high_mutation_fittest[:-1]:
    if item[1] == max(high_mutation_fitness):
        top_performer = item
        break
print("Total features included: " + str(top_performer[0].count(1)))
According to your code, top_performer is then an int, not an array, so str(top_performer) is the correct way to use it. str(top_performer).count('1') could be what you are looking for; count here is a string method, not an int method.
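In miniature, the failure mode looks like this (hypothetical values): a name assigned only inside a conditional branch is unbound if that branch never runs.

def f(items):
    for item in items:
        if item > 10:
            winner = item
            break
    return winner  # UnboundLocalError if the loop never assigned winner

f([1, 2, 3])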

PyMC3 - Custom Theano Op to do numerical integration

I am using PyMC3 for parameter estimation with a particular likelihood function that has to be user-defined. I googled and found that I should use the DensityDist method for implementing user-defined likelihood functions, but it is not working. How do I incorporate a user-defined likelihood function in PyMC3 and find the maximum a posteriori (MAP) estimate for my model? My code is given below; here L is the analytic form of my likelihood function. I have observational data for the radial velocity (vr) and position (r) of some objects, imported from an Excel file.
# imports inferred from usage
import decimal as dm
import math as m
import random as rm

import numpy as np
import pandas
import pymc3 as pm
import theano
import theano.tensor as tt
from scipy.integrate import quad

data_ = np.array(pandas.read_excel('aaa.xlsx', header=None))
gamma = 3.77
G = 4.302*10**-6
rmin = 3.0
R = 95.7
vr = data_[:, 1]
r = data_[:, 0]
h = np.pi

class integrateOut(theano.Op):
    def __init__(self, f, t, t0, tf, *args, **kwargs):
        super(integrateOut, self).__init__()
        self.f = f
        self.t = t
        self.t0 = t0
        self.tf = tf

    def make_node(self, *inputs):
        self.fvars = list(inputs)
        try:
            self.gradF = tt.grad(self.f, self.fvars)
        except:
            self.gradF = None
        return theano.Apply(self, self.fvars, [tt.dscalar().type()])

    def perform(self, node, inputs, output_storage):
        args = tuple(inputs)
        f = theano.function([self.t] + self.fvars, self.f)
        output_storage[0][0] = quad(f, self.t0, self.tf, args=args)[0]

    def grad(self, inputs, grads):
        return [integrateOut(g, self.t, self.t0, self.tf)(*inputs)*grads[0]
                for g in self.gradF]

basic_model = pm.Model()
with basic_model:
    M = []
    beta = []
    interval = 0.01*10**12
    M = pm.Uniform('M', lower=0.5*10**12, upper=3.50*10**12, transform='interval')
    beta = pm.Uniform('beta', lower=2.001, upper=2.999, transform='interval')
    gamma = 3.77
    logp = []
    arr = []
    vnew = []
    rnew = []
    theta = tt.scalar('theta')
    beta = tt.scalar('beta')
    z = tt.cos(theta)**(2*((gamma/(beta - 2)) - 3/2) + 3)
    intZ = integrateOut(z, theta, -(np.pi)/2, (np.pi)/2)(beta)
    gradIntZ = tt.grad(intZ, [beta])
    funcIntZ = theano.function([beta], intZ)
    funcGradIntZ = theano.function([beta], gradIntZ)
    for j in np.arange(0, 59, 1):
        vnew.append(vr[j] + (0.05*vr[j]*float(dm.Decimal(rm.randrange(1, 20))/10)))
        rnew.append(r[j] + (0.05*r[j]*float(dm.Decimal(rm.randrange(1, 20))/10)))
    vn = np.array(vnew)
    rn = np.array(rnew)
    for beta in np.arange(2.01, 2.99, 0.01):
        for M in np.arange(0.5, 2.50, 0.01):
            i = np.arange(0, 59, 1)
            q = (gamma/(beta - 2)) - 3/2
            B = (G*M*10**12)/((beta - 2)*(R**(3 - beta)))
            K = (gamma - 3)/((rmin**(3 - gamma))*funcIntZ(beta)*m.sqrt(2*B))
            logp = -np.log(K*((1 - ((1/(2*B))*((vn[i]**2)*rn[i]**(beta - 2))))**(q+1))*(rn[i]**(1 - gamma + (beta/2))))
            arr.append(logp.sum())

    def logp_func(rn, vn):
        return min(np.array(arr))

    logpvar = pm.DensityDist("logpvar", logp_func, observed={"rn": rn, "vn": vn})
    start = pm.find_MAP(model=basic_model)
    step = pm.Metropolis()
    basicmodeltrace = pm.sample(10000, step=step, start=start, random_seed=1, progressbar=True)
    print(pm.summary(basicmodeltrace))

map_estimate = pm.find_MAP(model=basic_model)
print(map_estimate)
I am getting the following error message:
ValueError: Cannot compute test value: input 0 (theta) of Op
Elemwise{cos,no_inplace}(theta) missing default value.
Backtrace when that variable is created:
I am unable to get the output, since the numerical integration is not working. I used a custom Theano Op for the numerical integration, which I got from Custom Theano Op to do numerical integration. The integration works if I run it separately with a particular value of beta, but not within the model.
I made a few changes to your code. It still does not work, but I hope it is closer to a solution. Please check this thread, as someone there is trying to solve essentially the same problem.
class integrateOut(theano.Op):
    def __init__(self, f, t, t0, tf, *args, **kwargs):
        super(integrateOut, self).__init__()
        self.f = f
        self.t = t
        self.t0 = t0
        self.tf = tf

    def make_node(self, *inputs):
        self.fvars = list(inputs)
        try:
            self.gradF = tt.grad(self.f, self.fvars)
        except:
            self.gradF = None
        return theano.Apply(self, self.fvars, [tt.dscalar().type()])

    def perform(self, node, inputs, output_storage):
        args = tuple(inputs)
        f = theano.function([self.t] + self.fvars, self.f)
        output_storage[0][0] = quad(f, self.t0, self.tf, args=args)[0]

    def grad(self, inputs, grads):
        return [integrateOut(g, self.t, self.t0, self.tf)(*inputs)*grads[0]
                for g in self.gradF]

gamma = 3.77
G = 4.302E-6
rmin = 3.0
R = 95.7
vr = data[:, 1]
r = data[:, 0]
h = np.pi
interval = 1E10

vnew = []
rnew = []
for j in np.arange(0, 59, 1):
    vnew.append(vr[j] + (0.05*vr[j] * float(dm.Decimal(rm.randrange(1, 20))/10)))
    rnew.append(r[j] + (0.05*r[j] * float(dm.Decimal(rm.randrange(1, 20))/10)))
vn = np.array(vnew)
rn = np.array(rnew)

def integ(gamma, beta, theta):
    z = tt.cos(theta)**(2*((gamma/(beta - 2)) - 3/2) + 3)
    return integrateOut(z, theta, -(np.pi)/2, (np.pi)/2)(beta)

with pm.Model() as basic_model:
    M = pm.Uniform('M', lower=0.5*10**12, upper=3.50*10**12)
    beta = pm.Uniform('beta', lower=2.001, upper=2.999)
    theta = pm.Normal('theta', 0, 10**2)

    def logp_func(rn, vn):
        q = (gamma/(beta - 2)) - 3/2
        B = (G*M*1E12) / ((beta - 2)*(R**(3 - beta)))
        K = (gamma - 3) / ((rmin**(3 - gamma)) * integ(gamma, beta, theta) * (2*B)**0.5)
        logp = -np.log(K*((1 - ((1/(2*B))*((vn**2)*rn**(beta - 2))))**(q+1))*(rn**(1 - gamma + (beta/2))))
        return logp.sum()

    logpvar = pm.DensityDist("logpvar", logp_func, observed={"rn": rn, "vn": vn})
    start = pm.find_MAP()
    #basicmodeltrace = pm.sample()
    print(start)
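As a sanity check, the op can be exercised on its own, outside the model. This sketch mirrors the observation in the question that the integration works when run separately with a particular value of beta (it assumes the integrateOut class and imports above; 3.77 is the gamma from the question):

theta = tt.dscalar('theta')
beta = tt.dscalar('beta')
z = tt.cos(theta)**(2*((3.77/(beta - 2)) - 3/2) + 3)
intZ = integrateOut(z, theta, -np.pi/2, np.pi/2)(beta)
f = theano.function([beta], intZ)
print(f(2.5))  # a plain float, as expected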

Splitting an array in Python with many columns

I'm used to splitting arrays like this:
a = open('filename.txt', 'r')
b = a.readlines()
a.close()
c = len(b)
d = list(zip(*(e.split() for e in b)))
self.r11 = []
self.r21 = []
self.yr = []
self.mn = []
self.dy = []
self.f = d[0]
self.g = d[1]
self.h = d[2]
self.i = d[3]
self.j = d[4]
for n in range(2, c):
    nn = int(self.f[n])
    if nn > mm:
        self.yr.append(self.f[n])
        self.mn.append(self.g[n])
        self.dy.append(self.h[n])
        self.r11.append(self.i[n])
        self.r12.append(self.j[n])
etc...
I now have a CSV database that is much bigger, at least in terms of the number of columns (144); the total grid is 144 x 73. What is the easiest way of splitting off the data? I figure you would want to put it into a multidimensional array, but what is the easiest way of doing that? I will be working with one file at a time and with multiple files at a time (i.e., multiple 144 x 73 files), depending on what I'm working with and at what level.
