Fit CDF with 2 Gaussians using LeastSq - python-3.x

I am trying to fit an empirical CDF to a sum of two Gaussian CDFs, since the data appear to have two peaks, but it does not work. I fit the curve with leastsq from scipy.optimize and the erf function from scipy.special. The fit only produces a constant line at a value of 2. I am not sure in which part of the code I made a mistake. Any pointers would be helpful. Thanks!
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
y = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
plt.plot(x, y, 'r.')
# Fitting with 2 Gaussian
from scipy.special import erf
from scipy.optimize import leastsq
def two_gaussian_cdf(params, x):
    (mu1, sigma1, mu2, sigma2) = params
    model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) +\
            0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
    return model

def residual_two_gaussian_cdf(params, x, y):
    model = two_gaussian_cdf(params, x)
    return model - y

params = [5.,2.,1.,2.]
out = leastsq(residual_two_gaussian_cdf,params,args=(x,y))
two_gaussian_cdf(out[0],x)
plt.plot(x,two_gaussian_cdf(out[0],x))
which returns this plot:

You may find lmfit (see http://lmfit.github.io/lmfit-py/) to be a useful alternative to leastsq here, as it provides a higher-level interface to optimization and curve fitting (though still based on scipy.optimize.leastsq). With lmfit, your example might look like this (omitting the definitions of the x and y data):
#!/usr/bin/env python
import numpy as np
from scipy.special import erf
import matplotlib.pyplot as plt
from lmfit import Model
# define the basic model. I included an amplitude parameter
def gaussian_cdf(x, amp, mu, sigma):
    return (amp/2.0)*(1 + erf( (x-mu)/(sigma*np.sqrt(2))))
# create a model that is the sum of two gaussian_cdfs
# note that a prefix names each component and will be
# applied to the parameter names for each model component
model = Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_')
# make a parameters object -- a dict with parameter names
# taken from the arguments of your model function and prefix
params = model.make_params(g1_amp=0.50, g1_mu=94, g1_sigma=1,
                           g2_amp=0.50, g2_mu=98, g2_sigma=1.)
# you can apply bounds to any parameter
#params['g1_sigma'].min = 0 # sigma must be > 0!
# you may want to fix the amplitudes to 0.5:
#params['g1_amp'].vary = False
#params['g2_amp'].vary = False
# run the fit
result = model.fit(y, params, x=x)
# print results
print(result.fit_report())
# plot results, including individual components
comps = result.eval_components(result.params, x=x)
plt.plot(x, y,'r.', label='data')
plt.plot(x, result.best_fit, 'k-', label='fit')
plt.plot(x, comps['g1_'], 'b--', label='g1_')
plt.plot(x, comps['g2_'], 'g--', label='g2_')
plt.legend()
plt.show()
This prints out a report of
[[Model]]
    (Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_'))
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 66
    # data points      = 50
    # variables        = 6
    chi-square         = 0.00626332
    reduced chi-square = 1.4235e-04
    Akaike info crit   = -437.253376
    Bayesian info crit = -425.781238
[[Variables]]
    g1_amp:    0.65818908 +/- 0.00851338 (1.29%) (init = 0.5)
    g1_mu:     93.8438526 +/- 0.01623273 (0.02%) (init = 94)
    g1_sigma:  0.54362156 +/- 0.02021614 (3.72%) (init = 1)
    g2_amp:    0.34058664 +/- 0.01153346 (3.39%) (init = 0.5)
    g2_mu:     97.7056728 +/- 0.06408910 (0.07%) (init = 98)
    g2_sigma:  1.24891832 +/- 0.09204020 (7.37%) (init = 1)
[[Correlations]] (unreported correlations are < 0.100)
    C(g1_amp, g2_amp)   = -0.892
    C(g2_amp, g2_sigma) =  0.848
    C(g1_amp, g2_sigma) = -0.744
    C(g1_amp, g1_mu)    =  0.692
    C(g1_amp, g2_mu)    =  0.662
    C(g1_mu, g2_amp)    = -0.607
    C(g1_amp, g1_sigma) =  0.571
and a plot like this:
This fit is not perfect, but it should get you started.
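If you want the summed model to remain a true CDF that saturates at 1.0, one option (my suggestion, not part of the fit above) is to tie the two amplitudes together with an lmfit parameter expression so that they sum to one. A minimal sketch, reusing the model and params defined above:
# constrain the amplitudes to sum to 1 so the total model saturates at 1.0
params['g1_amp'].set(value=0.6, min=0, max=1)
params['g2_amp'].set(expr='1 - g1_amp')  # algebraic constraint, no longer a free variable
result = model.fit(y, params, x=x)
print(result.fit_report())
This removes one free variable and guarantees CDF-like behavior at both ends of the data.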

Here is how I used the scipy.optimize.differential_evolution module to generate initial parameter estimates for curve fitting. I have coded the sum of squared errors as the target for the genetic algorithm as shown below. This scipy module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, which requires parameter bounds within which to search. In this case, the parameter bounds are automatically derived from the data so that there is no need to provide them manually in the code.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import warnings
from scipy.optimize import differential_evolution
from scipy.special import erf
# bounds on parameters are set in generate_Initial_Parameters() below
def two_gaussian_cdf(x, mu1, sigma1, mu2, sigma2):
    model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) +\
            0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
    return model

# function for genetic algorithm to minimize (sum of squared error)
# bounds on parameters are set in generate_Initial_Parameters() below
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    return np.sum((yData - two_gaussian_cdf(xData, *parameterTuple)) ** 2)

def generate_Initial_Parameters():
    # data min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)

    parameterBounds = []
    parameterBounds.append([minX, maxX]) # parameter bounds for mu1
    parameterBounds.append([minY, maxY]) # parameter bounds for sigma1
    parameterBounds.append([minX, maxX]) # parameter bounds for mu2
    parameterBounds.append([minY, maxY]) # parameter bounds for sigma2

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
xData = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
yData = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
# generate initial parameter values
initialParameters = generate_Initial_Parameters()
# curve fit the data
fittedParameters, pcov = curve_fit(two_gaussian_cdf, xData, yData, initialParameters)
# create values for display of fitted peak function
mu1, sigma1, mu2, sigma2 = fittedParameters
y_fit = two_gaussian_cdf(xData, mu1, sigma1, mu2, sigma2)
plt.plot(xData, yData) # plot the raw data
plt.plot(xData, y_fit) # plot the equation using the fitted parameters
plt.show()
print(fittedParameters)

Related

Fitting data with a double Gaussian

I am attempting to fit some data with a double Gaussian profile. The data looks almost perfectly Gaussian, but try as I might, I can't get a fit better than a certain shape, regardless of the initial guesses I input. I've tried to use the two gaussian equations listed below, but neither fit quite right. Overall I'd like it to be flatter on the continuum (no 'wings') and have a smoother, closer fit to the actual shape if possible.
Due to the nature of the follow-up analysis, the fit needs to be a double Gaussian, as I require the fitting parameters, and thus I can't consider other fitting methods. The data can be found here:
https://docs.google.com/spreadsheets/d/1kMO2ogAL8ZCiDeY29kBvv5lzMfAD7dLj-5rKW8kW9Go/edit?usp=sharing
Below is an example of the code I've been using to try and fit the data, as well as the output figure.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.optimize import curve_fit
from lmfit import Model
with open("data.txt","r") as f:
content=[i.strip() for i in f.readlines()]
vel=[]
I=[]
dI=[]
for i in range(8,len(content)):
line=content[i].split()
vel.append(float(line[0]))
I.append(float(line[1]))
dI.append(float(line[2]))
def gaussian(x, A, x0, sig):
return A*np.exp(-(x-x0)**2/(2*sig**2))
def gaussian2(x, amp, cen, wid):
return (amp/(np.sqrt(2*np.pi)*wid))*np.exp(-(x-cen)**2/(2*wid**2))
def multi_gaussian(x, *pars):
offset = pars[-1]
g1 = gaussian(x, pars[1], pars[0], pars[2])
g2 = gaussian(x, pars[3], pars[0], pars[4])
return g1 + g2 + offset
def multi_gaussian2(x, *pars):
offset = pars[-1]
g1 = gaussian2(x, pars[1], pars[0], pars[2])
g2 = gaussian2(x, pars[3], pars[0], pars[4])
return g1 + g2 + offset
offset=1
guess = [-15,-0.01,10,-0.01,10,1]
popt, pcov = curve_fit(multi_gaussian, vel, I, guess)
popt2, pcov2 = curve_fit(multi_gaussian2, vel, I, guess)
x=np.linspace(np.min(vel),np.max(vel), 2000)
plt.figure()
plt.scatter(vel,I,s=0.1,c='b')
plt.plot(x, multi_gaussian(x, *popt), 'r--', linewidth=1,label='Gaussian1')
plt.plot(x, multi_gaussian2(x, *popt2), 'g--', linewidth=1,label='Gaussian2')
plt.legend(loc='best')
plt.show()
The data in your linked spreadsheet only has 2 significant digits for velocity and intensity. That makes it basically impossible to "refine" your fit to get a better result. That said, I highly recommend using an lmfit script like the following, which will include your intensity uncertainties in the fit:
import matplotlib.pyplot as plt
import numpy as np
from lmfit.models import GaussianModel, ConstantModel
data = np.loadtxt('ddata.txt', skiprows=1)
v = data[:, 0]
i = data[:, 1]
di = data[:, 2]
model = (ConstantModel(prefix='offset_') +
         GaussianModel(prefix='p1_') +
         GaussianModel(prefix='p2_'))

params = model.make_params(offset_c=1,
                           p1_amplitude=-1., p1_sigma=100, p1_center=25,
                           p2_amplitude=-1., p2_sigma=100, p2_center=-25)
init = model.eval(params, x=v)
result = model.fit(i, params, weights=1.0/(di+1.e-9), x=v)
print(result.fit_report())
plt.figure()
plt.scatter(v, i, s=0.5, label='data')
plt.plot(v, init, label='init')
plt.plot(v, result.best_fit, label='fit')
plt.legend()
plt.xlabel('velocity (mm/s)')
plt.ylabel('intensity')
plt.show()
For the data you supplied, this will print out a fit report like this:
[[Model]]
    ((Model(constant, prefix='offset_') + Model(gaussian, prefix='p1_')) + Model(gaussian, prefix='p2_'))
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 128
    # data points      = 191
    # variables        = 7
    chi-square         = 654.770994
    reduced chi-square = 3.55853801
    Akaike info crit   = 249.314315
    Bayesian info crit = 272.080229
[[Variables]]
    offset_c:      1.00013943 +/- 5.1045e-05 (0.01%) (init = 1)
    p1_amplitude: -1.36807407 +/- 0.08677931 (6.34%) (init = -1)
    p1_center:     46.8019583 +/- 3.77807981 (8.07%) (init = 25)
    p1_sigma:      57.3859589 +/- 2.39823612 (4.18%) (init = 100)
    p2_amplitude: -1.16999330 +/- 0.08533205 (7.29%) (init = -1)
    p2_center:    -76.1117581 +/- 3.49975073 (4.60%) (init = -25)
    p2_sigma:      51.7080694 +/- 2.08860434 (4.04%) (init = 100)
    p1_fwhm:       135.133604 +/- 5.64741436 (4.18%) == '2.3548200*p1_sigma'
    p1_height:    -0.00951073 +/- 2.6406e-04 (2.78%) == '0.3989423*p1_amplitude/max(2.220446049250313e-16, p1_sigma)'
    p2_fwhm:       121.763196 +/- 4.91828727 (4.04%) == '2.3548200*p2_sigma'
    p2_height:    -0.00902683 +/- 3.5183e-04 (3.90%) == '0.3989423*p2_amplitude/max(2.220446049250313e-16, p2_sigma)'
[[Correlations]] (unreported correlations are < 0.100)
    C(p1_center, p2_amplitude)    = -0.967
    C(p1_amplitude, p2_center)    =  0.959
    C(p1_center, p2_center)       =  0.956
    C(p1_amplitude, p2_amplitude) = -0.946
    C(p1_amplitude, p1_center)    =  0.943
    C(p2_amplitude, p2_center)    = -0.943
and a plot of the data, initial model, and fit.

Python Scipy Curvefit to Linear Quadratic Curve

I'm trying to fit a linear-quadratic model curve to experiment data. The Y axis values decrease from 1 to 10^-5. When I use the following code, the resulting curve often seems not to fit the data at higher X values. I suspect that because the Y values at high X values are so small, the resulting differences between the experiment values and the model values are small. But I would like the model curve to pass as close to the higher X value points as possible (even if it means the low values are not as well fitted). I haven't found anything about weighting in scipy.optimize.curve_fit, other than using standard deviations (which I don't have). How can I improve my model fit at high X values?
import numpy as np  # needed for np.exp below
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

def lq(x, a, b):
    #y(x) = exp[-(ax+bx²)]
    y = []
    for i in x:
        x2=i**2
        ax = a*i
        bx2 = b*x2
        y.append(np.exp(-(ax+bx2)))
    return y
#x and y are from experiment
x=[0,1.778,2.921,3.302,6.317,9.524,10.54]
y=[1,0.831763771,0.598411595,0.656145266,0.207014135,0.016218101,0.004102041]
(a,b), pcov = curve_fit(lq, x, y, p0=[0.05,0.05])
#make the model curve using a and b
xmodel = list(range(0,20))
ymodel = lq(xmodel, a, b)
fig, ax1 = plt.subplots()
ax1.set_yscale('log')
ax1.plot(x,y, "ro", label="Experiment")
ax1.plot(xmodel,ymodel, "r--", label="Model")
plt.show()
I agree with your assessment that the fit is not very sensitive to small misfits at the small values of y. Since you are plotting the data and fit on a semi-log plot, I think that what you really want is to fit in log space as well; that is, fit log(y) to a quadratic function. As an aside (but an important one if you're going to be doing numerical work with Python), you should not loop over lists but rather use numpy arrays: this will make everything faster and simpler. With such changes, your script might look like
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
def lq(x, a, b):
    return -(a*x+b*x*x)
x = np.array([0,1.778,2.921,3.302,6.317,9.524,10.54])
y = np.array([1,0.831763771,0.598411595,0.656145266,0.207014135,0.016218101,0.004102041])
(a,b), pcov = curve_fit(lq, x, np.log(y), p0=[0.05,0.05])
xmodel = np.arange(20) # Note: use numpy!
ymodel = np.exp(lq(xmodel, a, b)) # Note: take exp() as inverse log()
fig, ax1 = plt.subplots()
ax1.set_yscale('log')
ax1.plot(x, y, "ro", label="Experiment")
ax1.plot(xmodel,ymodel, "r--", label="Model")
plt.show()
Note that the model function is changed to just be the ax+bx^2 you wanted to write in the first place and that this is now fitting np.log(y), not y. This will give a much more satisfying fit at the smaller y values.
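If you would rather fit y directly and simply give more influence to the small y values, note that scipy.optimize.curve_fit does accept per-point uncertainties through its sigma argument. A minimal sketch (my addition, assuming errors roughly proportional to y, which weights the points much like the log-space fit):
import numpy as np
from scipy.optimize import curve_fit

def lq_y(x, a, b):
    # model for y itself: exp(-(a*x + b*x**2))
    return np.exp(-(a*x + b*x*x))

x = np.array([0,1.778,2.921,3.302,6.317,9.524,10.54])
y = np.array([1,0.831763771,0.598411595,0.656145266,0.207014135,0.016218101,0.004102041])

# each residual is scaled by 1/sigma_i, so sigma=y down-weights the large values
(aw, bw), pcov_w = curve_fit(lq_y, x, y, p0=[0.05, 0.05], sigma=y, absolute_sigma=False)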
You might also find lmfit (https://lmfit.github.io/lmfit-py/) helpful for this problem (disclaimer: I am a lead author). With this, your fit script could become
from lmfit import Model
model = Model(lq)
params = model.make_params(a=0.05, b=0.05)
result = model.fit(np.log(y), params, x=x)
print(result.fit_report())
xmodel = np.arange(20)
ymodel = np.exp(result.eval(x=xmodel))
plt.plot(x, y, "ro", label="Experiment")
plt.plot(xmodel, ymodel, "r--", label="Model")
plt.yscale('log')
plt.legend()
plt.show()
This will print out a report including fit statistics and interpretable uncertainties and correlations between variables:
[[Model]]
    Model(lq)
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 7
    # data points      = 7
    # variables        = 2
    chi-square         = 0.16149397
    reduced chi-square = 0.03229879
    Akaike info crit   = -22.3843833
    Bayesian info crit = -22.4925630
[[Variables]]
    a: -0.05212688 +/- 0.04406602 (84.54%) (init = 0.05)
    b:  0.05274458 +/- 0.00479056 (9.08%) (init = 0.05)
[[Correlations]] (unreported correlations are < 0.100)
    C(a, b) = -0.968
and give a plot of the fit.
Note that lmfit Parameters can be fixed or bounded and that lmfit comes with many built-in models.
Finally, if you were to include a constant term in the quadratic model, you would not really need an iterative method but could use polynomial regression, as with numpy.polyfit.
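For instance, a minimal sketch of that route (my addition; note that numpy.polyfit fits p2*x**2 + p1*x + p0 to log(y), so the signs of p1 and p2 are flipped relative to the lq() parameterization above):
import numpy as np

x = np.array([0,1.778,2.921,3.302,6.317,9.524,10.54])
y = np.array([1,0.831763771,0.598411595,0.656145266,0.207014135,0.016218101,0.004102041])

# direct polynomial regression of log(y) on x, with a constant term p0
p2, p1, p0 = np.polyfit(x, np.log(y), 2)
xmodel = np.arange(20)
ymodel = np.exp(np.polyval([p2, p1, p0], xmodel))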
Here is a graphical Python fitter using your data with a Gompertz type of sigmoidal equation. This code uses scipy's Differential Evolution genetic algorithm module to determine initial parameter estimates for scipy's non-linear curve_fit() routine. That scipy module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, requiring bounds within which to search. In this example, I made all of the parameter search bounds from -2.0 to 2.0, and that seems to work in this case. Note that it is much easier to provide ranges for the initial parameter estimates than specific values, and those parameter ranges can be generous.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
#x and y are from experiment
x=[0,1.778,2.921,3.302,6.317,9.524,10.54]
y=[1,0.831763771,0.598411595,0.656145266,0.207014135,0.016218101,0.004102041]
# alias data to match previous example code
xData = numpy.array(x, dtype=float)
yData = numpy.array(y, dtype=float)
def func(x, a, b, c): # Sigmoidal Gompertz C from zunzun.com
    return a * numpy.exp(b * numpy.exp(c*x))

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)

def generate_Initial_Parameters():
    parameterBounds = []
    parameterBounds.append([-2.0, 2.0]) # search bounds for a
    parameterBounds.append([-2.0, 2.0]) # search bounds for b
    parameterBounds.append([-2.0, 2.0]) # search bounds for c

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# note: by default, differential_evolution also polishes its result
# with a bounded local minimizer before returning
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # plot with log Y axis scaling
    plt.yscale('log')

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

How to determine by which model my curve should be and approximate it?

I have a curve and its digitized data: https://drive.google.com/open?id=1ZB39G3SmtamjVjmLzkC2JefloZ9iShpO
How should I choose a suitable function for the least squares method, and how can this approximation be implemented in Python?
I tried to do it like this:
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import numpy as np
import sympy as sym
x = np.loadtxt("x_data.txt", delimiter='\t', dtype=np.float)
y = np.loadtxt("y_data.txt", delimiter='\t', dtype=np.float)
plt.plot(x, y, 'ro',label="Original Data")
x = np.array(x, dtype=float)
y = np.array(y, dtype=float)
def func(x, a, b, c, d):
return a*x**3 + b*x**2 +c*x + d
popt, pcov = curve_fit(func, x, y)
xs = sym.Symbol('\lambda')
tex = sym.latex(func(xs,*popt)).replace('$', '')
plt.title(r'$f(\lambda)= %s$' %(tex),fontsize=16)
plt.plot(x, func(x, *popt), label="Fitted Curve")
plt.legend(loc='upper left')
plt.show()
thanks
I extracted data from your plot for analysis, and here is my first cut at the problem. Since your plot used decade-log scaling, I took the base-10 anti-log of the "left-side" Y data for fitting. My peak equation search on this "extracted" data turned up a log-normal type of equation, and here is a graphical Python fitter that reads the data files and fits this equation. This fitter uses the scipy differential_evolution genetic algorithm module to determine initial parameter estimates for the non-linear fitter, which requires parameter ranges within which to search. In this code, the data maximum and minimum values are used along with my range estimates. It is much easier to estimate ranges for the initial parameter values than specific values. This fitter should be able to read your data files directly. If you can post or link to the actual data, I might be able to make a better fit than is shown here.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
with open('./x_data.txt', 'rt') as f:
    x_file = f.read()
with open('./y_data.txt', 'rt') as f:
    y_file = f.read()

xlist = []
for line in x_file.split('\n'):
    if line: # this allows blank lines in file
        xlist.append(float(line.strip()))

ylist = []
for line in y_file.split('\n'):
    if line: # this allows blank lines in file
        ylist.append(float(line.strip()))

if len(xlist) != len(ylist):
    print(len(xlist), len(ylist))
    raise Exception('X and Y have different length')
xData = numpy.array(xlist)
yData = numpy.array(ylist)
def func(t, a, b, c, d): # Log-Normal Peak A Shifted from zunzun.com
    return a * numpy.exp(-0.5 * numpy.power((numpy.log(t-d)-b) / c, 2.0))

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)

def generate_Initial_Parameters():
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)

    parameterBounds = []
    parameterBounds.append([minY, maxY]) # search bounds for a
    parameterBounds.append([0.0, 2.0]) # search bounds for b
    parameterBounds.append([-1.0, 0.0]) # search bounds for c
    parameterBounds.append([-maxX, 0.0]) # search bounds for d

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# note: by default, differential_evolution also polishes its result
# with a bounded local minimizer before returning
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData), 1000)
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
EDIT: use actual data
With the actual data now available, I found that a Pulse peak equation gives a good fit to the data; here is an updated fitter. I recommend taking additional data from time 0 to 5 if possible, as this will yield data that better characterizes the region at the beginning of the peak.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
with open('./x_data.txt', 'rt') as f:
    x_file = f.read()
with open('./y_data.txt', 'rt') as f:
    y_file = f.read()

xlist = []
for line in x_file.split('\n'):
    if line: # this allows blank lines in file
        xlist.append(float(line.strip()))

ylist = []
for line in y_file.split('\n'):
    if line: # this allows blank lines in file
        ylist.append(float(line.strip()))

if len(xlist) != len(ylist):
    print(len(xlist), len(ylist))
    raise Exception('X and Y have different length')
xData = numpy.array(xlist)
yData = numpy.array(ylist)
def func(t, a, b, c, Offset): # Pulse Peak With Offset from zunzun.com
    return 4.0 * a * numpy.exp(-1.0 * (t-b) / c) * (1.0 - numpy.exp(-1.0 * (t-b) / c)) + Offset

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)

def generate_Initial_Parameters():
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)

    parameterBounds = []
    parameterBounds.append([minY, maxY]) # search bounds for a
    parameterBounds.append([-5.0, 0.0]) # search bounds for b
    parameterBounds.append([1.0, 10.0]) # search bounds for c
    parameterBounds.append([minY, maxY]) # search bounds for Offset

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# note: by default, differential_evolution also polishes its result
# with a bounded local minimizer before returning
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData), 1000)
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Error in Scipy curve fit for more than two parameters

I am quite new to Scipy. I have a data file (https://www.dropbox.com/s/mwz8s2kap2mnwo0/data.dat?dl=0) and want to fit the function a*exp(b*x^c). The problem is that when I set the value of c manually (say c = 0.75), the code works perfectly, but if I try to find 'a', 'b' and 'c' from the fit, the code does not work, producing a flat line. Sorry if the problem is too silly. The code reads as:
import numpy as np
from scipy.optimize import curve_fit
import sys
import matplotlib.pyplot as plt
import math as math
filename = sys.argv[1]
data = np.loadtxt(filename)
x = np.array(data[:,0])
y = np.array(data[:,1])
def func(x, a, b, c):
    return a*np.exp(b*x**c)

params = curve_fit(func, x, y)
[a, b, c] = params[0]
perr = np.sqrt(np.diag(params[1]))

x_new = []
y_new = []
for i in np.linspace(1.00003e-05, 0.10303175629999914, num=1000):
    j = func(i, a, b, c)
    x_new.append(i)
    y_new.append(j)
x1 = np.array(x_new)
y1 = np.array(y_new)

print("a = ", a, "error = ", perr[0], "error % = ", (perr[0]/a)*100, '\t',
      "b = ", b, "error = ", perr[1], "error % = ", (perr[1]/b)*100, '\t',
      "c = ", c, "error = ", perr[2], "error % = ", (perr[2]/c)*100)
#np.savetxt('fit.dat', np.c_[x1, y1])
plt.plot(x, y, label='data')
plt.plot(x1, y1, label = 'a*np.exp(b*x**c)')
plt.xlabel('Time(s)')
plt.ylabel('SRO')
plt.legend()
plt.show()
Exponential equations can be quite sensitive to the non-linear solver's initial parameter estimates. By default, many non-linear solvers - including scipy's curve_fit - use a value of 1.0 for any initial parameter estimate that is not supplied, and in this particular case those values were not good initial estimates for your combination of data and equation. Scipy does include a genetic algorithm which can be used to determine the initial parameter estimates, and its implementation requires bounds within which to search. Here is an example graphical solver using the scipy differential_evolution genetic algorithm module for this purpose; note the ranges that I have used for the genetic algorithm to search within. It is much easier to give ranges for the parameters in this way than explicit values; though that is not always true, it worked here. You will need to change the file path that I used to load the data.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
filename = '/home/zunzun/Downloads/data.dat'
data = numpy.loadtxt(filename)
xData = numpy.array(data[:,0])
yData = numpy.array(data[:,1])
def func(x, a, b, c):
    return a*numpy.exp(b*x**c)

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)

def generate_Initial_Parameters():
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)
    minData = min(minX, minY)
    maxData = max(maxX, maxY)

    parameterBounds = []
    parameterBounds.append([-maxData * 10.0, maxData * 10.0]) # search bounds for a
    parameterBounds.append([-maxData * 10.0, maxData * 10.0]) # search bounds for b
    parameterBounds.append([-maxData * 10.0, maxData * 10.0]) # search bounds for c

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# note: by default, differential_evolution also polishes its result
# with a bounded local minimizer before returning
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)

Linear regression with intercept forced to zero AND uncertainty on value of slope

I want to do a linear regression with python with two requirements:
intercept forced to zero
in the output I would like to have uncertainty on the slope parameter, as well as p-value, r-squared...
As far as I know, stats.linregress does the first requirement, and np.linalg.lstsq does the second. Can someone help me find the easiest way to do this please?
Thank you very much,
Camille
This example has the statistics requested in your question, and also plots the fitted function vs. the data.
from scipy.optimize import curve_fit
import numpy as np
import scipy.odr
import scipy.stats
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
xData = np.array([5.357, 5.797, 5.936, 6.161, 6.697, 6.731, 6.775, 8.442, 9.861])
yData = np.array([0.376, 0.874, 1.049, 1.327, 2.054, 2.077, 2.138, 4.744, 7.104])
def func(x,b0):
    return b0 * x

initialParameters = numpy.array([np.mean(yData) / np.mean(xData)])

def f_wrapper_for_odr(beta, x): # parameter order for odr
    return func(x, *beta)
fittedParameters, cov= curve_fit(func, xData, yData, p0=initialParameters)
model = scipy.odr.odrpack.Model(f_wrapper_for_odr)
data = scipy.odr.odrpack.Data(xData, yData)
myodr = scipy.odr.odrpack.ODR(data, model, beta0=fittedParameters, maxit=0)
myodr.set_job(fit_type=2)
fittedParameterstatistics = myodr.run()
df_e = len(xData) - len(fittedParameters) # degrees of freedom, error
cov_beta = fittedParameterstatistics.cov_beta # parameter covariance matrix from ODR
sd_beta = fittedParameterstatistics.sd_beta * fittedParameterstatistics.sd_beta
t_df = scipy.stats.t.ppf(0.975, df_e)
ci = []
for i in range(len(fittedParameters)):
    ci.append([fittedParameters[i] - t_df * fittedParameterstatistics.sd_beta[i], fittedParameters[i] + t_df * fittedParameterstatistics.sd_beta[i]])
tstat_beta = fittedParameters / fittedParameterstatistics.sd_beta # coeff t-statistics
pstat_beta = (1.0 - scipy.stats.t.cdf(np.abs(tstat_beta), df_e)) * 2.0 # coef. p-values
for i in range(len(fittedParameters)):
    print('parameter:', fittedParameters[i])
    print('   conf interval:', ci[i][0], ci[i][1])
    print('   tstat:', tstat_beta[i])
    print('   pstat:', pstat_beta[i])
    print()
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data') # X axis data label
    axes.set_ylabel('Y Data') # Y axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
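As an aside (my addition; it assumes the statsmodels package, which is not used above): ordinary least squares without a constant term gives the zero-intercept slope together with its standard error, p-value and r-squared in a few lines:
import numpy as np
import statsmodels.api as sm

xData = np.array([5.357, 5.797, 5.936, 6.161, 6.697, 6.731, 6.775, 8.442, 9.861])
yData = np.array([0.376, 0.874, 1.049, 1.327, 2.054, 2.077, 2.138, 4.744, 7.104])

# no sm.add_constant() call, so the intercept is forced to zero
ols_result = sm.OLS(yData, xData).fit()
print(ols_result.params)    # slope
print(ols_result.bse)       # standard error of the slope
print(ols_result.pvalues)   # p-value for the slope
print(ols_result.rsquared)  # r-squared (uncentered, since there is no intercept)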
import numpy as np
from scipy.optimize import curve_fit
xdata = np.array([x values])
ydata = np.array([y values])
def func(x, a):
    return a*x
popt, pcov = curve_fit(func, xdata, ydata)
residuals = ydata- func(xdata, *popt)
ss_res = np.sum(residuals**2)
ss_tot = np.sum((ydata-np.mean(ydata))**2)
r_squared = 1 - (ss_res / ss_tot)
dgr_free = len(xdata)-1
chi_sqr = sum([(y-func(x,*popt))**2/func(x,*popt) for x,y in zip(xdata,ydata)])
print(popt) # will print out your variables in order, in this case just a
print(r_squared)
print(chi_sqr,dgr_free) # btw this is chi squared not p
The idea here is that we do a regression of the linear function without the + b term; since b moves the y-axis intercept up and down, setting it to 0 gives a linear regression with intercept at (0,0).
A benefit of using scipy.optimize.curve_fit is also that you can do the regression for any formula - though r_squared is somewhat redundant for curved regressions.
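For reference (my addition, under the usual independent-error assumptions, reusing the example data from the first answer): the zero-intercept least-squares slope also has a closed form, b = sum(x*y)/sum(x**2), so the slope, its standard error and the p-value can be computed directly without an iterative fitter:
import numpy as np
from scipy import stats

xdata = np.array([5.357, 5.797, 5.936, 6.161, 6.697, 6.731, 6.775, 8.442, 9.861])
ydata = np.array([0.376, 0.874, 1.049, 1.327, 2.054, 2.077, 2.138, 4.744, 7.104])

b = np.sum(xdata*ydata) / np.sum(xdata*xdata)  # closed-form zero-intercept slope
residuals = ydata - b*xdata
dgr_free = len(xdata) - 1                      # one fitted parameter
se_b = np.sqrt(np.sum(residuals**2) / dgr_free / np.sum(xdata*xdata))  # std. error of slope
t_stat = b / se_b
p_value = 2 * (1 - stats.t.cdf(abs(t_stat), dgr_free))  # two-sided p-value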
