keras:how to get initial loss function value before training - keras

In Keras, I checked the callbacks mechanism.
However it does not provide any information before the start of training.Like the output is always after epoch = 1.I would like to check the value of the loss function for the first time feed forward.How can I achieve this?
Thanks.
This answer does not work.
'setting model.trainable = False and then train the model'.How to perform feed forward propagation in CNN using Keras?
I set model.trainable = False before compiling the model, but the model still output different loss functions.This is weird.It is supposed to output a constant loss which is the loss when a forward-feed is performed.
The code is in the following:
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Input
from keras.models import Sequential
import numpy as np
import random
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers.core import Dropout,Activation,Flatten,Lambda
from keras.layers.normalization import BatchNormalization
import keras
import time
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from ann_visualizer.visualize import ann_viz;
def gen_x(n,p,rho):
if abs(rho) < 1 :
beta=np.sqrt(rho/(1-rho))
x0=np.random.normal(size=(n,p))
z=np.random.normal(size=(n,1))
x=beta*np.repeat(z,repeats=p,axis=1)+x0
if abs(rho)==1:
x=np.repeat(z,repeats=p,axis=1)
return x
## This function creates true survival times as described in section 3 of the paper. In all simulations we set snr (signal to noise ratio) to 3.
def genecoef(p):
#return list( map(lambda x : np.power(-1,x)*np.exp(-0.1*(x-1)), np.arange(1,p+1,1)) )
return list( np.random.rand(p) )
def gen_times(x,snr):
n,p=x.shape
coef=genecoef(p)
f=np.matmul(np.matrix(x),np.matrix(coef).T)
e=np.random.normal(size=(n,1))
k=np.sqrt(np.var(f)/(snr*np.var(e)))
y=np.exp(f+k*e)
return(y)
## This function creates true survival times as described in section 3 of the paper. In all simulations we set snr (signal to noise ratio) to 3.
def gen_times_censor(x,snr):
n,p=x.shape
coef=genecoef(p)
f=np.matmul(np.matrix(x),np.matrix(coef).T)
e=np.random.normal(size=(n,1))
k=np.sqrt(np.var(f)/(snr*np.var(e)))
y=np.exp(k*e)
return(y)
def nltr(x):
y1 = x[:,0]*x[:,1]
y2 = x[:,2]*x[:,3]
y3 = x[:,4]**2
y4 = x[:,5]* (x[:,6]**2)
y5 = x[:,7]*x[:,8]* x[:,9]
y6 = 0.5 *np.exp(x[:,8]* x[:,9])
newx = np.column_stack((y1,y2,y3,y4,y5,y6))
return newx
def survdata(n,p,snr,rho):
x = gen_x(n,p,rho)
time = gen_times(x,snr)
censortime = gen_times_censor(x,snr)
y = np.apply_along_axis(np.min,1,np.column_stack((time,censortime)))
y = np.array(y)
#b==0 censored b ==1 uncensored
b = np.apply_along_axis(np.argmax,1,np.column_stack((time,censortime)))
b = np.array(b)
a = x
ordery=np.argsort(y)
a=a[ordery]
y=y[ordery]
b=b[ordery]
Rlist=[]
event_index=np.argwhere(b==1).ravel().astype(np.int32)
nsample=len(b)
nevent=sum(b)
Rlist=[]
for j in range(nevent):
Rlist+=[ list(range(np.argwhere(b==1).ravel()[j],nsample) )]
bmask = b.astype(bool)
cumlist=list(reversed(np.append(event_index,n)))
slarr=np.vectorize(lambda x:(len(x)-1))
nctrue = np.sum(slarr(Rlist))
#a:n(#samples)*p(#features) matrix,survival time from short to high
#y:survival time
#b censored(0) or not(1)
#bmask bool(b)
#nevent #uncensored
return a,y,b,bmask,nsample,nevent,event_index,Rlist,cumlist,nctrue
n=50
p=10
snr=1
rho=0.1
a,y,b,bmask,nsample,nevent,event_index,Rlist,cumlist,nctrue= survdata(n,p,snr,rho)
sc=StandardScaler()
a=nltr(a)
a=sc.fit_transform(a)
def ploss(y_true,y_pred):
#y_pred for sample x_i is the value of np.dot(x_i,beta) in the linear cox case
#y_pred is the loss for sample i
z = 0
#for j in event_index:
#z = z + K.sum(y_pred[j,0])
#z = z + K.constant(y_pred[j,0])
#z = K.sum(tf.boolean_mask(y_pred,bmask) )
#iz = K.print_tensor(tf.boolean_mask(y_pred,bmask),'y_pred_mask is')
#gz = K.print_tensor(K.gather(y_pred,event_index),'y_pred_gather is')
z = K.sum(K.gather(y_pred,event_index))
currentsum = 0
for j in range(nevent):
currentsum = currentsum + K.sum(K.exp(K.gather(y_pred,\
np.array(range(cumlist[j+1],cumlist[j])))))
z = z - K.log(currentsum)
#tempz=0
#for i in j:
#tempz = tempz + K.exp(y_pred[i,0])
#z = z - K.log(tempz)
z = -z
return z
def c_index_func(y_true, y_pred):
#y_pred is the loss for sample i
c_hat = 0
for i in range(nevent-1):
c_hat = c_hat + K.sum(K.cast(y_pred[event_index[i]+1:,0]\
<y_pred[event_index[i],0],tf.float32))
#c_hat = c_hat + K.sum(K.cast(y_pred[event_index[i]+1:,0]\
#<y_pred[event_index[i],0],float32))
return c_hat/nctrue
model=Sequential()
model.add(Dense(1,activation='linear',kernel_initializer='one',\
batch_input_shape=(a.shape[0],a.shape[1])))
#model.add(Dropout(0.2))
#model.compile(loss=ploss,optimizer='newton-raphson')
#model.compile(loss=ploss,optimizer=keras.optimizers.Adam(lr=0, beta_1=0.9, beta_2=0.999, \
#epsilon=None, decay=0.0, amsgrad=False),metrics=[c_index_func])
model.trainable=False
model.compile(loss=ploss,optimizer=keras.optimizers.SGD(lr=0.001, momentum=0.0, \
decay=0.0, nesterov=False),metrics=[c_index_func])
model.fit(x=a,y=y,batch_size=len(a),epochs=3,verbose=2)

For this you can just use model.evaluate(x, y) and it will return an array with the loss and metrics. The first element of this array is the loss on the given data. Just do this before training and it will give you the initial loss.

it is very easy just make the learning rate = 0 and train the DNN then all the losses are the initial loss

Related

Predicting classes in MNIST dataset with a Gaussian- the same prediction errors with different paramemters?

I am trying to find the best c parameter following the instructions to a task that asks me to ' Define a function, fit_generative_model, that takes as input a training set (train_data, train_labels) and fits a Gaussian generative model to it. It should return the parameters of this generative model; for each label j = 0,1,...,9, where
pi[j]: the frequency of that label
mu[j]: the 784-dimensional mean vector
sigma[j]: the 784x784 covariance matrix
It is important to regularize these matrices. The standard way of doing this is to add cI to them, where c is some constant and I is the 784-dimensional identity matrix. c is now a parameter, and by setting it appropriately, we can improve the performance of the model.
%matplotlib inline
import sys
import matplotlib.pyplot as plt
import gzip, os
import numpy as np
from scipy.stats import multivariate_normal
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# Downloads the dataset
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
print("Downloading %s" % filename)
urlretrieve(source + filename, filename)
# Invokes download() if necessary, then reads in images
def load_mnist_images(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=16)
data = data.reshape(-1,784)
return data
def load_mnist_labels(filename):
if not os.path.exists(filename):
download(filename)
with gzip.open(filename, 'rb') as f:
data = np.frombuffer(f.read(), np.uint8, offset=8)
return data
## Load the training set
train_data = load_mnist_images('train-images-idx3-ubyte.gz')
train_labels = load_mnist_labels('train-labels-idx1-ubyte.gz')
## Load the testing set
test_data = load_mnist_images('t10k-images-idx3-ubyte.gz')
test_labels = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
train_data.shape, train_labels.shape
So I have written this code for three different C-values. they each give me the same error?
def fit_generative_model(x,y):
lst=[]
for c in [20,200, 4000]:
k = 10 # labels 0,1,...,k-1
d = (x.shape)[1] # number of features
mu = np.zeros((k,d))
sigma = np.zeros((k,d,d))
pi = np.zeros(k)
for label in range(0,k):
indices = (y == label)
mu[label] = np.mean(x[indices,:], axis=0)
sigma[label] = np.cov(x[indices,:], rowvar=0, bias=1) + c*np.identity(784) # I define the identity matrix
predictions = np.argmax(score, axis=1)
errors = np.sum(predictions != y)
lst.append(errors)
print(c,"Model makes " + str(errors) + " errors out of 10000", lst)
Then I fit it to the training data and get these same errors:
mu, sigma, pi = fit_generative_model(train_data, train_labels)
20 Model makes 1 errors out of 10000 [1]
200 Model makes 1 errors out of 10000 [1, 1]
4000 Model makes 1 errors out of 10000 [1, 1, 1]
and to the test data:
mu, sigma, pi = fit_generative_model(test_data, test_labels)
20 Model makes 9020 errors out of 10000 [9020]
200 Model makes 9020 errors out of 10000 [9020, 9020]
4000 Model makes 9020 errors out of 10000 [9020, 9020, 9020]
What is it I'm doing wrong? the correct answer is c=4000 which yields an error of ~4.3%.

How to run a proper Bayesian Logistic Regression

I'm trying to run a bayesian logistic regression on the wine dataset provided from the sklearn package. As variables, I decided to use alcohol, color_intensity, flavanoids, hue and magnesium where alcohol is my response variable and the rest the predictors. To do so, I'm using pyro and torch packages:
import pyro
import torch
import pyro.distributions as dist
import pyro.optim as optim
from pyro.infer import SVI, Trace_ELBO
import pandas as pd
import numpy as np
from pyro.infer import Predictive
import torch.distributions.constraints as constraints
from sklearn import datasets
pyro.set_rng_seed(0)
#loading data and prepearing dataframe
wine = datasets.load_wine()
data = pd.DataFrame(columns = wine['feature_names'], data=wine['data'] )
#choosiing variables: response and predictors
variables = data[['alcohol', 'color_intensity', 'flavanoids', 'hue', 'magnesium']]
#standardization
variables = (variables-variables.min())/(variables.max()-variables.min())
#tensorizing
alcohol = torch.tensor(variables['alcohol'].values, dtype=torch.float)
predictors = torch.stack([torch.tensor(variables[column].values, dtype=torch.float)
for column in ['alcohol', 'color_intensity', 'flavanoids', 'hue', 'magnesium']], 1)
#splitting data
k = int(0.8 * len(variables))
x_train, y_train = predictors[:k], alcohol[:k]
x_test, y_test = predictors[k:], alcohol[k:]
#modelling
def model_alcohol(predictors, alcohol):
n_observations, n_predictors = predictors.shape
#weights
w = pyro.sample('w', dist.Normal(torch.zeros(n_predictors), torch.ones(n_predictors)))
epsilon = pyro.sample('epsilon', dist.Normal(0.,1.))
#non-linearity
y_hat = torch.sigmoid((w*predictors).sum(dim=1) + epsilon)
sigma = pyro.sample("sigma", dist.Uniform(0.,3.))
with pyro.plate('alcohol', len(alcohol)):
y=pyro.sample('y', dist.Normal(y_hat, sigma), obs=alcohol)
def guide_alcohol(predictors, alcohol=None):
n_observations, n_predictors = predictors.shape
w_loc = pyro.param('w_loc', torch.rand(n_predictors))
w_scale = pyro.param('w_scale', torch.rand(n_predictors), constraint=constraints.positive)
w = pyro.sample('w', dist.Normal(w_loc, w_scale))
epsilon_loc = pyro.param('b_loc', torch.rand(1))
epsilon_scale = pyro.param('b_scale', torch.rand(1), constraint=constraints.positive)
epsilon = pyro.sample('epsilon', dist.Normal(epsilon_loc, epsilon_scale))
sigma_loc = pyro.param('sigma_loc', torch.rand(n_predictors))
sigma_scale = pyro.param('sigma_scale', torch.rand(n_predictors),
constraint=constraints.positive)
sigma = pyro.sample('sigma', dist.Normal(sigma_loc, sigma_scale))
alcohol_svi = SVI(model=model_alcohol, guide=guide_alcohol, optim=optim.ClippedAdam({'lr' : 0.0002}),
loss=Trace_ELBO())
losses = []
for step in range(10000):
loss = alcohol_svi.step(x_train, y_train)/len(x_train)
losses.append(loss)
As I have to use Stochastic Variational Inference, I have defined both the model and the guide. My problem is now at matching tensor sizes, as I now I get the error:
RuntimeError: The size of tensor a (142) must match the size of tensor b (5) at non-singleton
dimension 0
Trace Shapes:
Param Sites:
Sample Sites:
w dist 5 |
value 5 |
epsilon dist |
value 1 |
sigma dist |
value 5 |
alcohol dist |
value 142 |
I'm kinda new to the idea of modelling on my own, so clearly there are mistakes around the code (hopefully not on the theory behind it). Still, I see I should adjust dimension on the guide maybe? I'm not entirely sure on how to honestly.
Your main problem is that w is not declared as a single event (.to_event(1)), and your variance (sigma) should have the same dim as your observations (()). The model and guide below fix this; I suggest you look at auto-generated guides in Pyro, and a different prior on sigma.
def model_alcohol(predictors, alcohol):
n_observations, n_predictors = predictors.shape
# weights
# w is a single event
w = pyro.sample('w', dist.Normal(torch.zeros(n_predictors), torch.ones(n_predictors)).to_event(1))
epsilon = pyro.sample('epsilon', dist.Normal(0., 1.))
# non-linearity
y_hat = torch.sigmoid(predictors # w + epsilon) # (predictors * weight).sum(1) == predictors # w
sigma = pyro.sample("sigma", dist.Uniform(0., 3.))
with pyro.plate('alcohol', len(alcohol)):
pyro.sample('y', dist.Normal(y_hat, sigma), obs=alcohol)
def guide_alcohol(predictors, alcohol=None):
n_observations, n_predictors = predictors.shape
w_loc = pyro.param('w_loc', torch.rand(n_predictors))
w_scale = pyro.param('w_scale', torch.rand(n_predictors), constraint=constraints.positive)
pyro.sample('w', dist.Normal(w_loc, w_scale).to_event(1))
epsilon_loc = pyro.param('b_loc', torch.rand(1))
epsilon_scale = pyro.param('b_scale', torch.rand(1), constraint=constraints.positive)
epsilon = pyro.sample('epsilon', dist.Normal(epsilon_loc, epsilon_scale))
sigma_loc = pyro.param('sigma_loc', torch.rand(1))
sigma_scale = pyro.param('sigma_scale', torch.rand(1),
constraint=constraints.positive)
pyro.sample('sigma', dist.HalfNormal(sigma_loc, sigma_scale)) # MUST BE POSITIVE

Argument must be a string or a number issue, Not 'Type' - Pyspark

Update:
So i have been looking into the issue, the problem is with scikit-multiflow datastream. in last quarter of code stream_clf.partial_fit(X,y, classes=stream.target_values) here the class valuefor stream.target_values should a number or string, but the method is returning (dtype). When i print or loop stream.target_values i get this:
I have tried to do conversion etc. but still of no use. can someone please help here ?
Initial Problem
I am running a code (took inspiration from here). It works perfectly alright when used vanilla python environment.
But if i run this code after certain modification in Apache Spark using Pyspark , i get the following error
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
I have tried every possibile way to trace the issue but everything looks alright. The error arises from the last line of the code where hoefding tree is called for prediction. It expects an ndarray and the type of X variable is also ndarray. I am not sure what is trigerring the issue. Can some one please help or direct me to right trace?
complete stack of error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-52-1310132c88db> in <module>
30 D3_win.addInstance(X,y)
31 xx = np.array(X,dtype='float64')
---> 32 y_hat = stream_clf.predict(xx)
33
34
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict(self, X)
1068 r, _ = get_dimensions(X)
1069 predictions = []
-> 1070 y_proba = self.predict_proba(X)
1071 for i in range(r):
1072 index = np.argmax(y_proba[i])
~/conceptDrift/projectTest/lib/python3.5/site-packages/skmultiflow/trees/hoeffding_tree.py in predict_proba(self, X)
1099 votes = normalize_values_in_dict(votes, inplace=False)
1100 if self.classes is not None:
-> 1101 y_proba = np.zeros(int(max(self.classes)) + 1)
1102 else:
1103 y_proba = np.zeros(int(max(votes.keys())) + 1)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'type'
Code
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import functions as fn
import sys
from pyspark import SparkContext,SparkConf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.data.data_stream import DataStream
import time
def drift_detector(S,T,threshold = 0.75):
T = pd.DataFrame(T)
#print(T)
S = pd.DataFrame(S)
# Give slack variable in_target which is 1 for old and 0 for new
T['in_target'] = 0 # in target set
S['in_target'] = 1 # in source set
# Combine source and target with new slack variable
ST = pd.concat( [T, S], ignore_index=True, axis=0)
labels = ST['in_target'].values
ST = ST.drop('in_target', axis=1).values
# You can use any classifier for this step. We advise it to be a simple one as we want to see whether source
# and target differ not to classify them.
clf = LogisticRegression(solver='liblinear')
predictions = np.zeros(labels.shape)
# Divide ST into two equal chunks
# Train LR on a chunk and classify the other chunk
# Calculate AUC for original labels (in_target) and predicted ones
skf = StratifiedKFold(n_splits=2, shuffle=True)
for train_idx, test_idx in skf.split(ST, labels):
X_train, X_test = ST[train_idx], ST[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)[:, 1]
predictions[test_idx] = probs
auc_score = AUC(labels, predictions)
print(auc_score)
# Signal drift if AUC is larger than the threshold
if auc_score > threshold:
return True
else:
return False
class D3():
def __init__(self, w, rho, dim, auc):
self.size = int(w*(1+rho))
self.win_data = np.zeros((self.size,dim))
self.win_label = np.zeros(self.size)
self.w = w
self.rho = rho
self.dim = dim
self.auc = auc
self.drift_count = 0
self.window_index = 0
def addInstance(self,X,y):
if(self.isEmpty()):
self.win_data[self.window_index] = X
self.win_label[self.window_index] = y
self.window_index = self.window_index + 1
else:
print("Error: Buffer is full!")
def isEmpty(self):
return self.window_index < self.size
def driftCheck(self):
if drift_detector(self.win_data[:self.w], self.win_data[self.w:self.size], auc): #returns true if drift is detected
self.window_index = int(self.w * self.rho)
self.win_data = np.roll(self.win_data, -1*self.w, axis=0)
self.win_label = np.roll(self.win_label, -1*self.w, axis=0)
self.drift_count = self.drift_count + 1
return True
else:
self.window_index = self.w
self.win_data = np.roll(self.win_data, -1*(int(self.w*self.rho)), axis=0)
self.win_label =np.roll(self.win_label, -1*(int(self.w*self.rho)), axis=0)
return False
def getCurrentData(self):
return self.win_data[:self.window_index]
def getCurrentLabels(self):
return self.win_label[:self.window_index]
def select_data(x):
x = "/user/hadoop1/tellus/sea_1.csv"
peopleDF = spark.read.csv(x, header= True)
df = peopleDF.toPandas()
scaler = MinMaxScaler()
df.iloc[:,0:df.shape[1]-1] = scaler.fit_transform(df.iloc[:,0:df.shape[1]-1])
return df
def check_true(y,y_hat):
if(y==y_hat):
return 1
else:
return 0
df = select_data("/user/hadoop1/tellus/sea_1.csv")
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(2000)
rho = float(0.4)
auc = float(0.60)
# In[ ]:
D3_win = D3(w,rho,stream.n_features,auc)
stream_acc = []
stream_record = []
stream_true= 0
i=0
start = time.time()
X,y = stream.next_sample(int(w*rho))
stream_clf.partial_fit(X,y, classes=stream.target_values)
while(stream.has_more_samples()):
X,y = stream.next_sample()
if D3_win.isEmpty():
D3_win.addInstance(X,y)
y_hat = stream_clf.predict(X)
Problem was with select_data() function, data type of variables was being changed during the execution. This issue is fixed now.

Fit CDF with 2 Gaussian using LeastSq

I am trying to fit empirical CDF plot to two Gaussian cdf as it seems that it has two peaks, but it does not work. I fit the curve with leastsq from scipy.optimize and erf function from scipy.special. The fitting only gives constant line at a value of 2. I am not sure in which part of the code that I make mistake. Any pointers will be helpful. Thanks!
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
y = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
plt.plot(a,b,'r.')
# Fitting with 2 Gaussian
from scipy.special import erf
from scipy.optimize import leastsq
def two_gaussian_cdf(params, x):
(mu1, sigma1, mu2, sigma2) = params
model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) +\
0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
return model
def residual_two_gaussian_cdf(params, x, y):
model = double_gaussian(params, x)
return model - y
params = [5.,2.,1.,2.]
out = leastsq(residual_two_gaussian_cdf,params,args=(x,y))
double_gaussian(out[0],x)
plt.plot(x,two_gaussian_cdf(out[0],x))
which return to this plot
You may find lmfit (see http://lmfit.github.io/lmfit-py/) to be a useful alternative to leastsq here as it provides a higher-level interface to optimization and curve fitting (though still based on scipy.optimize.leastsq). With lmfit, your example might look like this (cutting out the definition of x and y data):
#!/usr/bin/env python
import numpy as np
from scipy.special import erf
import matplotlib.pyplot as plt
from lmfit import Model
# define the basic model. I included an amplitude parameter
def gaussian_cdf(x, amp, mu, sigma):
return (amp/2.0)*(1 + erf( (x-mu)/(sigma*np.sqrt(2))))
# create a model that is the sum of two gaussian_cdfs
# note that a prefix names each component and will be
# applied to the parameter names for each model component
model = Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_')
# make a parameters object -- a dict with parameter names
# taken from the arguments of your model function and prefix
params = model.make_params(g1_amp=0.50, g1_mu=94, g1_sigma=1,
g2_amp=0.50, g2_mu=98, g2_sigma=1.)
# you can apply bounds to any parameter
#params['g1_sigma'].min = 0 # sigma must be > 0!
# you may want to fix the amplitudes to 0.5:
#params['g1_amp'].vary = False
#params['g2_amp'].vary = False
# run the fit
result = model.fit(y, params, x=x)
# print results
print(result.fit_report())
# plot results, including individual components
comps = result.eval_components(result.params, x=x)
plt.plot(x, y,'r.', label='data')
plt.plot(x, result.best_fit, 'k-', label='fit')
plt.plot(x, comps['g1_'], 'b--', label='g1_')
plt.plot(x, comps['g2_'], 'g--', label='g2_')
plt.legend()
plt.show()
This prints out a report of
[[Model]]
(Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_'))
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 66
# data points = 50
# variables = 6
chi-square = 0.00626332
reduced chi-square = 1.4235e-04
Akaike info crit = -437.253376
Bayesian info crit = -425.781238
[[Variables]]
g1_amp: 0.65818908 +/- 0.00851338 (1.29%) (init = 0.5)
g1_mu: 93.8438526 +/- 0.01623273 (0.02%) (init = 94)
g1_sigma: 0.54362156 +/- 0.02021614 (3.72%) (init = 1)
g2_amp: 0.34058664 +/- 0.01153346 (3.39%) (init = 0.5)
g2_mu: 97.7056728 +/- 0.06408910 (0.07%) (init = 98)
g2_sigma: 1.24891832 +/- 0.09204020 (7.37%) (init = 1)
[[Correlations]] (unreported correlations are < 0.100)
C(g1_amp, g2_amp) = -0.892
C(g2_amp, g2_sigma) = 0.848
C(g1_amp, g2_sigma) = -0.744
C(g1_amp, g1_mu) = 0.692
C(g1_amp, g2_mu) = 0.662
C(g1_mu, g2_amp) = -0.607
C(g1_amp, g1_sigma) = 0.571
and a plot like this:
This fit is not perfect, but it should get you started.
Here is how I used the scipy.optimize.differential_evolution module to generate initial parameter estimates for curve fitting. I have coded the sum of squared errors as the target for the genetic algorithm as shown below. This scipy module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, which requires parameter bounds within which to search. In this case, the parameter bounds are automatically derived from the data so that there is no need to provide them manually in the code.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import warnings
from scipy.optimize import differential_evolution
from scipy.special import erf
# bounds on parameters are set in generate_Initial_Parameters() below
def two_gaussian_cdf(x, mu1, sigma1, mu2, sigma2):
model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) +\
0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
return model
# function for genetic algorithm to minimize (sum of squared error)
# bounds on parameters are set in generate_Initial_Parameters() below
def sumOfSquaredError(parameterTuple):
warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
return np.sum((yData - two_gaussian_cdf(xData, *parameterTuple)) ** 2)
def generate_Initial_Parameters():
# data min and max used for bounds
maxX = max(xData)
minX = min(xData)
maxY = max(yData)
minY = min(yData)
parameterBounds = []
parameterBounds.append([minX, maxX]) # parameter bounds for mu1
parameterBounds.append([minY, maxY]) # parameter bounds for sigma1
parameterBounds.append([minX, maxX]) # parameter bounds for mu2
parameterBounds.append([minY, maxY]) # parameter bounds for sigma2
# "seed" the numpy random number generator for repeatable results
result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
return result.x
xData = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
yData = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
# generate initial parameter values
initialParameters = generate_Initial_Parameters()
# curve fit the data
fittedParameters, niepewnosci = curve_fit(two_gaussian_cdf, xData, yData, initialParameters)
# create values for display of fitted peak function
mu1, sigma1, mu2, sigma2 = fittedParameters
y_fit = two_gaussian_cdf(xData, mu1, sigma1, mu2, sigma2)
plt.plot(xData, yData) # plot the raw data
plt.plot(xData, y_fit) # plot the equation using the fitted parameters
plt.show()
print(fittedParameters)

Outputting coefficients when running linear regression using sklearn

I'm attempting to run a simple linear regression on a data set and retrieve the coefficients. The data which is from a a .csv file looks like:
"","time","LakeHuron"
"1",1875,580.38
"2",1876,581.86
"3",1877,580.97
"4",1878,580.8
...
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
def Main():
location = r"~/Documents/Time Series/LakeHuron.csv"
ts = pd.read_csv(location, sep=",", parse_dates=[0], header=None)
ts.drop(ts.columns[[0]], axis=1, inplace=True)
length = len(ts)
x = ts[1].values
y = ts[2].values
x = x.reshape(length, 1)
y = y.reshape(length, 1)
regr = linear_model.LinearRegression()
regr.fit(x, y)
print(regr.coef_)
if __name__ == "__main__":
Main()
Since this is a simple linear model then $Y_t = a_0 + a_1*t$, which in this case should be $Y_t = 580.202 -0.0242t$. and all that prints out when running the above code is [[-0.02420111]]. Is there anyway to get the second coefficient 580.202?
I've had a look at the documentation on http://scikit-learn.org/stable/modules/linear_model.html and it outputs two variables in the array.
Look like you only have one X and one Y, So output is correct.
Try this:
#coef_ : array, shape (n_features, ) or (n_targets, n_features)
print(regr.coef_)
#intercept_ : array Independent term in the linear model.
print(regr.intercept_)
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression

Resources