How to create synthetic data for a decaying curve in order to extrapolate it beyond some point? - python-3.x

In the following curve, I would like to extend the measurements beyond x=1 in order to have a better estimate of the green curve compared to the red line.
Note: I do not have the analytical form of the function but only x, y data sets in the range (0, 1).
Is there any package in Python that can extrapolate a curve beyond some value, given that we have the interpolated form of the curve?
Here is my attempt assuming a linear drop:
from scipy.interpolate import interp1d
import numpy as np

CURVE_CUT_INDEX = ...  # the index corresponding to x=1 in the x array

def extrapolator_function(x_vals, y_vals, x_list):
    interpolator = interp1d(x_vals, y_vals, kind='cubic')
    x_1 = x_vals[-1]
    y_1 = interpolator(x_1)
    y_grad, x_grad = (np.gradient(y_vals, np.arange(y_vals.size)),
                      np.gradient(x_vals, np.arange(x_vals.size)))
    slope = np.divide(y_grad, x_grad, out=np.zeros_like(y_grad), where=x_grad != 0)[-1]
    x_out = x_list[CURVE_CUT_INDEX + 1:]
    y_pred = np.array([slope * (x - x_1) + y_1 for x in x_out])
    return x_vals, y_vals, x_out, y_pred

def plotter(ax, x_list, y_list):
    x_vals, y_vals = x_list[0:CURVE_CUT_INDEX + 1], y_list(x_list)[0:CURVE_CUT_INDEX + 1]
    x_vals, y_vals, x_out, y_pred = extrapolator_function(x_vals, y_vals, x_list)
    return ax.plot(x_vals, y_vals, 'g-', x_out, y_pred, 'r-', alpha=1, lw=2)
which will result in the following extrapolation scheme (which is not what I want).
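For reference, here is a minimal sketch of one alternative to the linear drop: fit an assumed decaying model (an exponential here, purely as an assumption, since the analytical form is unknown) on the measured range with scipy.optimize.curve_fit and evaluate it beyond x=1. The model form, function names and starting values are illustrative, not something I know to be correct.

from scipy.optimize import curve_fit
import numpy as np

def decay_model(x, a, b, c):
    # assumed functional form: a * exp(-b * x) + c (an assumption, not the true curve)
    return a * np.exp(-b * x) + c

def extrapolate_decay(x_vals, y_vals, x_out):
    # fit the assumed model on the measured range (0, 1)
    p0 = (y_vals[0] - y_vals[-1], 1.0, y_vals[-1])  # rough starting values
    popt, _ = curve_fit(decay_model, x_vals, y_vals, p0=p0, maxfev=10000)
    # evaluate the fitted model beyond the last measured point
    return decay_model(np.asarray(x_out), *popt)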

Related

Python: Fitting a piecewise polynomial

I am trying to fit a piecewise polynomial function
Code:
import numpy as np
import scipy
from scipy.interpolate import UnivariateSpline, splrep
from scipy.optimize import curve_fit
from matplotlib import pyplot as plt

def piecewise_func(x, X, Y):
    """
    cond_l: condition list
    func_l: function list
    """
    spl = UnivariateSpline(X, Y, k=3, s=0.5)
    tck = (spl._data[8], spl._data[9], 3)  # tck = (knots, coefficients, degree)
    p = scipy.interpolate.PPoly.from_spline(tck)
    cond_l = []
    func_l = []
    for idx, i in enumerate(range(3, len(spl.get_knots()) + 3 - 1)):
        cond_l.append([(x >= p.x[i] & x < p.x[i + 1])])
        func_l.append([lambda x: p.c[3, i] + p.c[2, i] * x + p.c[1, i] * x ** 2 + p.c[0, i] * x ** 3])
    return np.piecewise(x, cond_l, func_l)

if __name__ == '__main__':
    xdata = [0.28190937, 0.63429607, 0.91620544, 1.68793236, 2.32350115, 2.95215219, 4.5,
             4.78103382, 7.2, 7.53430054, 8.03627018, 9., 9.86212529, 11.25951191, 11.62658532,
             11.65598578, 13.90295926]
    ydata = [0.36273168, 0.81614628, 1.17887796, 1.4475374, 5.52692706, 2.17548169, 3.55313396,
             3.80326533, 7.75556311, 8.30176616, 10.72117182, 11.2499386, 11.72296513, 11.02146624,
             14.51260631, 20.59365525, 21.77847853]
    spl = UnivariateSpline(xdata, ydata, k=3, s=1)
    plt.plot(xdata, ydata, '*')
    plt.plot(xdata, spl(xdata))
    plt.show()
    p, e = curve_fit(piecewise_func, xdata, ydata)
    # x_plot = np.linspace(0., 0.15, len(x))
    # plt.plot(x, y, "+")
    # plt.plot(x, (piecewise_func(x_plot, *p)), 'C3-', lw=3)
I tried the UnivariateSpline function to interpolate, and I see the following result.
However, I don't want the polynomial curve to pass through all data points. I tried varying the smoothing factor, but I am not able to obtain something like the one below.
Expected output:
I'm trying curve fitting (using UnivariateSpline to fit the data tightly) to get the expected output, and I have the following issues.
piecewise_func in the code posted returns the piecewise polynomial.
Passing this to curve_fit(piecewise_func, xdata, ydata) returns an error
Error:
res = leastsq(func, p0, Dfun=jac, full_output=1, **kwargs)
ValueError: diff requires input that is at least one dimensional
I am not sure what is wrong.
Suggestions on how to get the expected fit will be of great help.
I would recommend having a closer look at the parameter s in the UnivariateSpline documentation:
s : float or None, optional
Positive smoothing factor used to choose the number of knots. Number of knots will be increased until the smoothing condition is satisfied:
sum((w[i] * (y[i]-spl(x[i])))**2, axis=0) <= s
If s is None, s = len(w) which should be a good value if 1/w[i] is an estimate of the standard deviation of y[i]. If 0, spline will interpolate through all data points. Default is None.
Since you do not set w, this is just a complicated way of saying that s is the least squares error that you allow, i.e., squared errors summed over all the data points. Your value of 1 does not lead to interpolation but it is quite tight compared to what you want to achieve.
Taking
spl = UnivariateSpline(xdata, ydata, k=3, s=10)
you get the following:
Yet closer to your goal is s=100:
So my recommendation is to play around with s and if that proves insufficient, to ask a new question describing what you need more precisely. I haven't had a proper look at the problem with piecewise_func.
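To make the effect of s concrete, here is a small sketch (reusing the xdata and ydata from the question; the chosen s values are just examples) that fits the spline for a few values of s and reports the summed squared error that the smoothing condition bounds:

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline

x_plot = np.linspace(min(xdata), max(xdata), 500)
plt.plot(xdata, ydata, '*', label='data')
for s in (1, 10, 100):
    spl = UnivariateSpline(xdata, ydata, k=3, s=s)
    sse = np.sum((np.asarray(ydata) - spl(xdata)) ** 2)  # stays <= s
    plt.plot(x_plot, spl(x_plot), label='s=%g (SSE=%.1f)' % (s, sse))
plt.legend()
plt.show()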

How to compute the distance of data points to decision boundary when using the EllipticEnvelope of sklearn?

How can I compute the Euclidean distance to the decision boundary of the EllipticEnvelope? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
feature, output = "temperature", "consumption"
data = pd.DataFrame(np.random.normal(0,15, size=(2355,2)), columns=[feature, output])
X = data[[feature, output]]
X_train, X_test = train_test_split(X, shuffle=True, train_size=0.8)
model = EllipticEnvelope(contamination=0.18)
model.fit(X_train)
# extract the model predictions
y_pred = pd.Series(model.predict(X), index=X.index, name="anomaly")
# define the meshgrid : X = (u,v).T
u_min, u_max = X_train.iloc[:, 0].min() - 1.5, X_train.iloc[:, 0].max() + 1.5
v_min, v_max = X_train.iloc[:, 1].min() - 1.5, X_train.iloc[:, 1].max() + 1.5
n_points = 500
u = np.linspace(u_min, u_max, n_points)
v = np.linspace(v_min, v_max, n_points)
U, V = np.meshgrid(u, v)
# evaluate the decision function on the meshgrid
W = model.decision_function(np.c_[U.ravel(), V.ravel()])
W = W.reshape(U.shape)
plt.figure(figsize=(20,6))
a = plt.contour(U, V, W, levels=[0], linewidths=2, colors="black")
b = plt.scatter(X.loc[y_pred == 1].iloc[:, 0], X.loc[y_pred == 1].iloc[:, 1], c="yellowgreen", edgecolors='k')
c = plt.scatter(X.loc[y_pred == -1].iloc[:, 0], X.loc[y_pred == -1].iloc[:, 1], c="tomato", edgecolors='k')
plt.legend([a.collections[0], b, c], ['learned frontier', 'regular observations', 'abnormal observations'], bbox_to_anchor=(1.05, 1))
plt.axis('tight')
plt.show()
Edits
I am able to get the decision boundary points using the following code. Now, the problem can be solved by computing numerically the distance.
for item in a.collections:
    for i in item.get_paths():
        v = i.vertices
        x = v[:, 0]
        y = v[:, 1]
I have an obvious solution: take every data point d and compute the Euclidean distance between d and each boundary point e = (x, y). But it is a brute-force technique.. :D I will continue my research!
Another solution would be to fit an ellipse and compute the distance using the formula described by @epiliam here: https://math.stackexchange.com/questions/3670465/calculate-distance-from-point-to-ellipse-edge
I will provide one solution tomorrow based on the brute force. It seems to work well for small datasets (n_rows < 10000). I did not test it for larger ones.
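Here is the rough shape of that brute-force idea as a sketch, building on the boundary vertices extracted above; the result is approximate because the frontier is only sampled at the contour's vertices:

from scipy.spatial import cKDTree

# stack the vertices of every path of the learned frontier into one array
boundary = np.vstack([path.vertices
                      for item in a.collections
                      for path in item.get_paths()])

# nearest-neighbour (Euclidean) distance from each observation to the sampled boundary
tree = cKDTree(boundary)
dist_to_boundary, _ = tree.query(X.to_numpy())
data["dist_to_boundary"] = dist_to_boundary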

Fitting data with a double Gaussian

I am attempting to fit some data with a double Gaussian profile. The data looks almost perfectly Gaussian, but try as I might, I can't get a fit better than a certain shape, regardless of the initial guesses I input. I've tried to use the two gaussian equations listed below, but neither fit quite right. Overall I'd like it to be flatter on the continuum (no 'wings') and have a smoother, closer fit to the actual shape if possible.
Due to the nature of the follow-up analysis, the fit needs to be a double Gaussian, as I require the fitting parameters, and thus I can't consider other fitting methods. The data can be found here:
https://docs.google.com/spreadsheets/d/1kMO2ogAL8ZCiDeY29kBvv5lzMfAD7dLj-5rKW8kW9Go/edit?usp=sharing
Below is an example of the code I've been using to try and fit the data, as well as the output figure.
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.optimize import curve_fit
from lmfit import Model
with open("data.txt","r") as f:
content=[i.strip() for i in f.readlines()]
vel=[]
I=[]
dI=[]
for i in range(8,len(content)):
line=content[i].split()
vel.append(float(line[0]))
I.append(float(line[1]))
dI.append(float(line[2]))
def gaussian(x, A, x0, sig):
return A*np.exp(-(x-x0)**2/(2*sig**2))
def gaussian2(x, amp, cen, wid):
return (amp/(np.sqrt(2*np.pi)*wid))*np.exp(-(x-cen)**2/(2*wid**2))
def multi_gaussian(x, *pars):
offset = pars[-1]
g1 = gaussian(x, pars[1], pars[0], pars[2])
g2 = gaussian(x, pars[3], pars[0], pars[4])
return g1 + g2 + offset
def multi_gaussian2(x, *pars):
offset = pars[-1]
g1 = gaussian2(x, pars[1], pars[0], pars[2])
g2 = gaussian2(x, pars[3], pars[0], pars[4])
return g1 + g2 + offset
offset=1
guess = [-15,-0.01,10,-0.01,10,1]
popt, pcov = curve_fit(multi_gaussian, vel, I, guess)
popt2, pcov2 = curve_fit(multi_gaussian2, vel, I, guess)
x=np.linspace(np.min(vel),np.max(vel), 2000)
plt.figure()
plt.scatter(vel,I,s=0.1,c='b')
plt.plot(x, multi_gaussian(x, *popt), 'r--', linewidth=1,label='Gaussian1')
plt.plot(x, multi_gaussian2(x, *popt2), 'g--', linewidth=1,label='Gaussian2')
plt.legend(loc='best')
plt.show()
The data in your linked spreadsheet only has 2 significant digits for velocity and intensity. That makes it basically impossible to try to "refine" your fit to get a better result. That said, I highly recommend using an lmfit script like this, which will include your intensity uncertainties in the fit:
import matplotlib.pyplot as plt
import numpy as np
from lmfit.models import GaussianModel, ConstantModel
data = np.loadtxt('ddata.txt', skiprows=1)
v = data[:, 0]
i = data[:, 1]
di = data[:, 2]
model = (ConstantModel(prefix='offset_') +
         GaussianModel(prefix='p1_') +
         GaussianModel(prefix='p2_'))

params = model.make_params(offset_c=1,
                           p1_amplitude=-1., p1_sigma=100, p1_center=25,
                           p2_amplitude=-1., p2_sigma=100, p2_center=-25)
init = model.eval(params, x=v)
result = model.fit(i, params, weights=1.0/(di+1.e-9), x=v)
print(result.fit_report())
plt.figure()
plt.scatter(v, i, s=0.5, label='data')
plt.plot(v, init, label='init')
plt.plot(v, result.best_fit, label='fit')
plt.legend()
plt.xlabel('velocity (mm/s)')
plt.ylabel('intensity')
plt.show()
For the data you supplied, this will print out a fit report like this:
[[Model]]
    ((Model(constant, prefix='offset_') + Model(gaussian, prefix='p1_')) + Model(gaussian, prefix='p2_'))
[[Fit Statistics]]
    # fitting method = leastsq
    # function evals = 128
    # data points = 191
    # variables = 7
    chi-square = 654.770994
    reduced chi-square = 3.55853801
    Akaike info crit = 249.314315
    Bayesian info crit = 272.080229
[[Variables]]
    offset_c: 1.00013943 +/- 5.1045e-05 (0.01%) (init = 1)
    p1_amplitude: -1.36807407 +/- 0.08677931 (6.34%) (init = -1)
    p1_center: 46.8019583 +/- 3.77807981 (8.07%) (init = 25)
    p1_sigma: 57.3859589 +/- 2.39823612 (4.18%) (init = 100)
    p2_amplitude: -1.16999330 +/- 0.08533205 (7.29%) (init = -1)
    p2_center: -76.1117581 +/- 3.49975073 (4.60%) (init = -25)
    p2_sigma: 51.7080694 +/- 2.08860434 (4.04%) (init = 100)
    p1_fwhm: 135.133604 +/- 5.64741436 (4.18%) == '2.3548200*p1_sigma'
    p1_height: -0.00951073 +/- 2.6406e-04 (2.78%) == '0.3989423*p1_amplitude/max(2.220446049250313e-16, p1_sigma)'
    p2_fwhm: 121.763196 +/- 4.91828727 (4.04%) == '2.3548200*p2_sigma'
    p2_height: -0.00902683 +/- 3.5183e-04 (3.90%) == '0.3989423*p2_amplitude/max(2.220446049250313e-16, p2_sigma)'
[[Correlations]] (unreported correlations are < 0.100)
    C(p1_center, p2_amplitude) = -0.967
    C(p1_amplitude, p2_center) = 0.959
    C(p1_center, p2_center) = 0.956
    C(p1_amplitude, p2_amplitude) = -0.946
    C(p1_amplitude, p1_center) = 0.943
    C(p2_amplitude, p2_center) = -0.943
and a plot of the data, the initial model, and the best fit.

Error in Scipy curve fit for more than two parameters

I am quite new to Scipy. I have a data file (https://www.dropbox.com/s/mwz8s2kap2mnwo0/data.dat?dl=0) and want to fit the function a*exp(b*x^c). The problem is that when I give the value of c manually (say c = 0.75), the code works perfectly, but if I want to find 'a', 'b' and 'c' from the fit, the code does not work, producing a flat line. Sorry if the problem is too silly. The code reads as:
import numpy as np
from scipy.optimize import curve_fit
import sys
import matplotlib.pyplot as plt
import math as math
filename = sys.argv[1]
data = np.loadtxt(filename)
x = np.array(data[:,0])
y = np.array(data[:,1])
def func(x, a, b, c):
    return a*np.exp(b*x**c)

params = curve_fit(func, x, y)
[a, b, c] = params[0]
perr = np.sqrt(np.diag(params[1]))
x_new = []
y_new = []
for i in np.linspace(1.00003e-05, 0.10303175629999914, num=1000):
    j = func(i, a, b, c)
    x_new.append(i)
    y_new.append(j)
x1 = np.array(x_new)
y1 = np.array(y_new)
print("a = ", a, "error = ", perr[0], "error % = ", (perr[0]/a)*100, '\t',
      "b = ", b, "error = ", perr[1], "error % = ", (perr[1]/b)*100, '\t',
      "c = ", c, "error = ", perr[2], "error % = ", (perr[2]/c)*100)
#np.savetxt('fit.dat', np.c_[x1, y1])
plt.plot(x, y, label='data')
plt.plot(x1, y1, label='a*np.exp(b*x**c)')
plt.xlabel('Time(s)')
plt.ylabel('SRO')
plt.legend()
plt.show()
Exponential equations can be quite sensitive to the non-linear solver's initial parameter estimates. By default, many non-linear solvers, including scipy's curve_fit, use initial parameter values of 1.0 if none are supplied, and in this particular case those values were not good initial estimates for your combination of data and equation. Scipy does include a genetic algorithm which can be used to determine the initial parameter estimates, and its implementation requires bounds within which to search. Here is an example graphical solver using the scipy differential_evolution genetic algorithm module for this purpose; note the ranges that I have used for the genetic algorithm to search within. It is much easier to give ranges for the parameters in this way rather than explicit values, and though that is not always true, it worked here. You will need to change the file path that I used to load the data.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings

filename = '/home/zunzun/Downloads/data.dat'
data = numpy.loadtxt(filename)
xData = numpy.array(data[:,0])
yData = numpy.array(data[:,1])

def func(x, a, b, c):
    return a*numpy.exp(b*x**c)

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore")  # do not print warnings by genetic algorithm
    val = func(xData, *parameterTuple)
    return numpy.sum((yData - val) ** 2.0)

def generate_Initial_Parameters():
    # min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)
    minData = min(minX, minY)
    maxData = max(maxX, maxY)
    parameterBounds = []
    parameterBounds.append([-maxData * 10.0, maxData * 10.0])  # search bounds for a
    parameterBounds.append([-maxData * 10.0, maxData * 10.0])  # search bounds for b
    parameterBounds.append([-maxData * 10.0, maxData * 10.0])  # search bounds for c
    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x

# differential_evolution polishes its result with a local minimizer within the bounds
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters:', fittedParameters)
print()

modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError)  # squared errors
MSE = numpy.mean(SE)  # mean squared errors
RMSE = numpy.sqrt(MSE)  # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()

##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    # first the raw data as a scatter plot
    axes.plot(xData, yData, 'D')

    # create data for the fitted equation plot
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)

    # now the model as a line plot
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data')  # X axis data label
    axes.set_ylabel('Y Data')  # Y axis data label
    plt.show()
    plt.close('all')  # clean up after using pyplot

graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
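A shorter alternative sketch, reusing func, x and y from the question's code: simply pass an explicit initial guess p0 to curve_fit instead of relying on the defaults of 1.0. The values below are assumptions (the question notes that c = 0.75 already works when fixed by hand), but any roughly sensible guess usually avoids the flat-line result.

from scipy.optimize import curve_fit

p0 = [y[0], -1.0, 0.75]  # rough, assumed starting values for a, b, c
params, pcov = curve_fit(func, x, y, p0=p0, maxfev=20000)
a, b, c = params
print("a =", a, "b =", b, "c =", c)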

Fit CDF with 2 Gaussian using LeastSq

I am trying to fit an empirical CDF plot with two Gaussian CDFs, as it seems to have two peaks, but it does not work. I fit the curve with leastsq from scipy.optimize and the erf function from scipy.special. The fit only gives a constant line at a value of 2. I am not sure in which part of the code I made a mistake. Any pointers will be helpful. Thanks!
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
x = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
y = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
plt.plot(x, y, 'r.')

# Fitting with 2 Gaussians
from scipy.special import erf
from scipy.optimize import leastsq

def two_gaussian_cdf(params, x):
    (mu1, sigma1, mu2, sigma2) = params
    model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) + \
            0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
    return model

def residual_two_gaussian_cdf(params, x, y):
    model = two_gaussian_cdf(params, x)
    return model - y

params = [5., 2., 1., 2.]
out = leastsq(residual_two_gaussian_cdf, params, args=(x, y))
two_gaussian_cdf(out[0], x)
plt.plot(x, two_gaussian_cdf(out[0], x))
which returns this plot:
You may find lmfit (see http://lmfit.github.io/lmfit-py/) to be a useful alternative to leastsq here as it provides a higher-level interface to optimization and curve fitting (though still based on scipy.optimize.leastsq). With lmfit, your example might look like this (cutting out the definition of x and y data):
#!/usr/bin/env python
import numpy as np
from scipy.special import erf
import matplotlib.pyplot as plt
from lmfit import Model
# define the basic model. I included an amplitude parameter
def gaussian_cdf(x, amp, mu, sigma):
    return (amp/2.0)*(1 + erf( (x-mu)/(sigma*np.sqrt(2)) ))

# create a model that is the sum of two gaussian_cdfs
# note that a prefix names each component and will be
# applied to the parameter names for each model component
model = Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_')

# make a parameters object -- a dict with parameter names
# taken from the arguments of your model function and prefix
params = model.make_params(g1_amp=0.50, g1_mu=94, g1_sigma=1,
                           g2_amp=0.50, g2_mu=98, g2_sigma=1.)
# you can apply bounds to any parameter
#params['g1_sigma'].min = 0 # sigma must be > 0!
# you may want to fix the amplitudes to 0.5:
#params['g1_amp'].vary = False
#params['g2_amp'].vary = False
# run the fit
result = model.fit(y, params, x=x)
# print results
print(result.fit_report())
# plot results, including individual components
comps = result.eval_components(result.params, x=x)
plt.plot(x, y,'r.', label='data')
plt.plot(x, result.best_fit, 'k-', label='fit')
plt.plot(x, comps['g1_'], 'b--', label='g1_')
plt.plot(x, comps['g2_'], 'g--', label='g2_')
plt.legend()
plt.show()
This prints out a report of
[[Model]]
    (Model(gaussian_cdf, prefix='g1_') + Model(gaussian_cdf, prefix='g2_'))
[[Fit Statistics]]
    # fitting method = leastsq
    # function evals = 66
    # data points = 50
    # variables = 6
    chi-square = 0.00626332
    reduced chi-square = 1.4235e-04
    Akaike info crit = -437.253376
    Bayesian info crit = -425.781238
[[Variables]]
    g1_amp: 0.65818908 +/- 0.00851338 (1.29%) (init = 0.5)
    g1_mu: 93.8438526 +/- 0.01623273 (0.02%) (init = 94)
    g1_sigma: 0.54362156 +/- 0.02021614 (3.72%) (init = 1)
    g2_amp: 0.34058664 +/- 0.01153346 (3.39%) (init = 0.5)
    g2_mu: 97.7056728 +/- 0.06408910 (0.07%) (init = 98)
    g2_sigma: 1.24891832 +/- 0.09204020 (7.37%) (init = 1)
[[Correlations]] (unreported correlations are < 0.100)
    C(g1_amp, g2_amp) = -0.892
    C(g2_amp, g2_sigma) = 0.848
    C(g1_amp, g2_sigma) = -0.744
    C(g1_amp, g1_mu) = 0.692
    C(g1_amp, g2_mu) = 0.662
    C(g1_mu, g2_amp) = -0.607
    C(g1_amp, g1_sigma) = 0.571
and a plot like this:
This fit is not perfect, but it should get you started.
Here is how I used the scipy.optimize.differential_evolution module to generate initial parameter estimates for curve fitting. I have coded the sum of squared errors as the target for the genetic algorithm as shown below. This scipy module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, which requires parameter bounds within which to search. In this case, the parameter bounds are automatically derived from the data so that there is no need to provide them manually in the code.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import warnings
from scipy.optimize import differential_evolution
from scipy.special import erf
# bounds on parameters are set in generate_Initial_Parameters() below
def two_gaussian_cdf(x, mu1, sigma1, mu2, sigma2):
    model = 0.5*(1 + erf( (x-mu1)/(sigma1*np.sqrt(2)) )) + \
            0.5*(1 + erf( (x-mu2)/(sigma2*np.sqrt(2)) ))
    return model

# function for genetic algorithm to minimize (sum of squared error)
# bounds on parameters are set in generate_Initial_Parameters() below
def sumOfSquaredError(parameterTuple):
    warnings.filterwarnings("ignore")  # do not print warnings by genetic algorithm
    return np.sum((yData - two_gaussian_cdf(xData, *parameterTuple)) ** 2)

def generate_Initial_Parameters():
    # data min and max used for bounds
    maxX = max(xData)
    minX = min(xData)
    maxY = max(yData)
    minY = min(yData)
    parameterBounds = []
    parameterBounds.append([minX, maxX])  # parameter bounds for mu1
    parameterBounds.append([minY, maxY])  # parameter bounds for sigma1
    parameterBounds.append([minX, maxX])  # parameter bounds for mu2
    parameterBounds.append([minY, maxY])  # parameter bounds for sigma2
    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
xData = np.array([ 90.64115156, 90.85690063, 91.07264971, 91.28839878,
91.50414786, 91.71989693, 91.93564601, 92.15139508,
92.36714415, 92.58289323, 92.7986423 , 93.01439138,
93.23014045, 93.44588953, 93.6616386 , 93.87738768,
94.09313675, 94.30888582, 94.5246349 , 94.74038397,
94.95613305, 95.17188212, 95.3876312 , 95.60338027,
95.81912935, 96.03487842, 96.2506275 , 96.46637657,
96.68212564, 96.89787472, 97.11362379, 97.32937287,
97.54512194, 97.76087102, 97.97662009, 98.19236917,
98.40811824, 98.62386731, 98.83961639, 99.05536546,
99.27111454, 99.48686361, 99.70261269, 99.91836176,
100.13411084, 100.34985991, 100.56560899, 100.78135806,
100.99710713, 101.21285621])
yData = np.array([3.33333333e-04, 3.33333333e-04, 3.33333333e-04, 1.00000000e-03,
1.33333333e-03, 3.33333333e-03, 6.66666667e-03, 1.30000000e-02,
2.36666667e-02, 3.40000000e-02, 5.13333333e-02, 7.36666667e-02,
1.01666667e-01, 1.38666667e-01, 2.14000000e-01, 3.31000000e-01,
4.49666667e-01, 5.50000000e-01, 6.09000000e-01, 6.36000000e-01,
6.47000000e-01, 6.54666667e-01, 6.61000000e-01, 6.67000000e-01,
6.76333333e-01, 6.84000000e-01, 6.95666667e-01, 7.10000000e-01,
7.27666667e-01, 7.50666667e-01, 7.75333333e-01, 7.93333333e-01,
8.11333333e-01, 8.31333333e-01, 8.56333333e-01, 8.81333333e-01,
9.00666667e-01, 9.22666667e-01, 9.37666667e-01, 9.47333333e-01,
9.59000000e-01, 9.70333333e-01, 9.77333333e-01, 9.83333333e-01,
9.90333333e-01, 9.93666667e-01, 9.96333333e-01, 9.99000000e-01,
9.99666667e-01, 1.00000000e+00])
# generate initial parameter values
initialParameters = generate_Initial_Parameters()
# curve fit the data
fittedParameters, niepewnosci = curve_fit(two_gaussian_cdf, xData, yData, initialParameters)
# create values for display of fitted peak function
mu1, sigma1, mu2, sigma2 = fittedParameters
y_fit = two_gaussian_cdf(xData, mu1, sigma1, mu2, sigma2)
plt.plot(xData, yData) # plot the raw data
plt.plot(xData, y_fit) # plot the equation using the fitted parameters
plt.show()
print(fittedParameters)
