I am trying to estimate a normal density using a grid approximation in TensorFlow Probability (code 4.14 from McElreath's Statistical Rethinking).
The code I have so far is:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma= tf.linspace(start=4.0, stop=9.0, num=200)
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(df.height))
This fails due to df having shape (352,) whilst I am creating (200,) points for my normal distribution to be evaluated on.
However
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(2))
and
tf.reduce_sum(tfd.Normal(loc=mu[0], scale=sigma[0]).log_prob(df.height))
both work.
I need to create a (200, 352) tensor, one Normal for each (mu, sigma) pair on my grid, and then evaluate it against my sample data df. How do I do this?
I think TFP's joint distribution is a nice way to express this:
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
def mk_joint(nobs):
return tfd.JointDistributionNamed(dict(
mu=tfd.Normal(178, 20),
sigma=tfd.Uniform(0, 50),
height=lambda mu, sigma: tfd.Sample(tfd.Normal(loc=mu, scale=sigma), nobs)
))
joint = mk_joint(len(df))
joint.sample()
print(f'joint event shape: {joint.event_shape}')
lp = joint.log_prob(dict(mu=mu[:,tf.newaxis], sigma=sigma, height=df.height))
import matplotlib.pyplot as plt
plt.imshow(lp)
plt.xlabel('sigma')
plt.xticks(np.arange(len(sigma))[::10], sigma[::10].numpy().round(2), rotation=90)
plt.ylabel('mu')
plt.yticks(np.arange(len(mu))[::10], mu[::10].numpy().round(2))
plt.show()
=>
joint event shape: {'sigma': TensorShape([]), 'mu': TensorShape([]), 'height': TensorShape([352])}
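The broadcasting in the log_prob call is what produces the grid: mu[:, tf.newaxis] has shape (200, 1) and sigma has shape (200,), so lp comes out as a (200, 200) table of log-probabilities, with mu varying down the rows and sigma across the columns. For the (200, 352) tensor asked about in the question, broadcasting a single tfd.Normal also works; a minimal sketch, reusing mu, sigma, df and the imports from above (the float32 cast is only there to match the dtype of the linspace grids):
heights = df.height.values.astype(np.float32)            # shape (352,)
per_obs_lp = tfd.Normal(loc=mu[:, tf.newaxis],            # batch shape (200, 1)
                        scale=sigma[:, tf.newaxis]).log_prob(heights)
print(per_obs_lp.shape)                                   # (200, 352): one row per (mu, sigma) pair
log_lik = tf.reduce_sum(per_obs_lp, axis=-1)              # (200,) summed log-likelihoods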
So, I figured out that one way to do it would be to create a (200, 200, 352) grid and then reshape, and the rest of the calculations follow straightforwardly.
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
# Grid over (mu, sigma); the dummy third axis has length len(df) == 352
means, sigmas, _ = tf.meshgrid(mu, sigma, np.zeros((352,), dtype=np.float32))
means = tf.reshape(means, [40000, 352])
sigmas = tf.reshape(sigmas, [40000, 352])
# Normal's scale argument is the standard deviation, not the variance
normal = tfd.Normal(loc=means, scale=sigmas)
log_lik = tf.reduce_sum(normal.log_prob(df.height), axis=1)
logprob_mu = tfd.Normal(178.0, 20.0).log_prob(means)
logprob_sigma = tfd.Uniform(low=0.0, high=50.0).log_prob(sigmas)
log_joint_prod = log_lik + logprob_mu[:, 0] + logprob_sigma[:, 0]
joint_prob_tf = tf.exp(log_joint_prod - tf.reduce_max(log_joint_prod))
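A possible continuation, to turn this into a normalized grid posterior (a sketch reusing joint_prob_tf from above; the reshape assumes tf.meshgrid's default 'xy' ordering, under which axis 0 indexes sigma and axis 1 indexes mu):
posterior = joint_prob_tf / tf.reduce_sum(joint_prob_tf)    # normalize so the grid sums to 1
posterior_grid = tf.reshape(posterior, [200, 200])          # rows: sigma values, columns: mu values
import matplotlib.pyplot as plt
plt.imshow(posterior_grid)
plt.show()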
I have a matrix of int32 integers (50k x 50k or so) which I need to convert to float32. Two ways I have considered:
# Preparation for the example
import numpy as np
n = 50_000
matrix = np.random.randint(0, 10, (n, n), dtype='int32')
# Way 1:
matrix = matrix.astype(np.float32, copy=False)
# Way 2:
matrix = matrix.view(np.float32)
When should I use which one? Is there a speed-disadvantage in later use of a view compared to a "real" numpy array?
What I tried
Execution time analysis (creation, not later access)
import numpy as np
import timeit
def create_boxplot(duration_list, showfliers=False):
import seaborn as sns
import matplotlib.pyplot as plt
import operator
plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor="w", edgecolor="k")
sns.set(style="whitegrid")
sorted_keys, sorted_vals = zip(
*sorted(duration_list.items(), key=operator.itemgetter(1))
)
flierprops = dict(markerfacecolor="0.75", markersize=1, linestyle="none")
ax = sns.boxplot(
data=sorted_vals,
width=0.3,
orient="h",
flierprops=flierprops,
showfliers=showfliers,
)
ax.set(xlabel="Time in ms", ylabel="")
plt.yticks(plt.yticks()[0], sorted_keys)
plt.tight_layout()
plt.savefig("output.png")
n = 5_000
matrix = np.random.randint(0, 2, (n, n), dtype='int32')
print(matrix.dtype)
matrix = matrix.view(np.float32)
print(matrix.dtype)
timeit_d = {}
timeit_d["repeat"] = 500
timeit_d["number"] = 3
timeit_d["setup"] = "import numpy as np; n=5_000; matrix = np.random.randint(0, 2, (n, n), dtype='int32')"
duration_list = {}
# Time the view call (Way 2 above)
durations = timeit.repeat(
"matrix2 = matrix.view(np.float32)",
setup=timeit_d["setup"],
repeat=timeit_d["repeat"],
number=timeit_d["number"],
)
duration_list["view"] = durations
print("Done views")
# Time the astype call (Way 1 above)
durations = timeit.repeat(
"matrix2 = matrix.astype(np.float32)",
setup=timeit_d["setup"],
repeat=timeit_d["repeat"],
number=timeit_d["number"],
)
duration_list["astype"] = durations
print("Done astype")
# Visualize
create_boxplot(duration_list)
Clearly views are way faster than astype.
Memory analysis
$ valgrind --tool=massif python3 foobar.py
$ massif-visualizer massif.out.view
Clearly shows that the view option uses WAY less memory.
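Worth noting when comparing the two (a quick standalone check): astype converts each integer to the nearest float32 and copies the data, whereas view only reinterprets the existing int32 bytes as float32 bit patterns without copying, so the resulting values are not the same:
import numpy as np
m = np.array([[1, 2], [3, 4]], dtype=np.int32)
print(m.astype(np.float32))           # [[1. 2.] [3. 4.]] - values converted, data copied
print(m.view(np.float32))             # tiny denormal floats - same bytes reinterpreted, no copy
print(m.view(np.float32).base is m)   # True: the view shares memory with the original array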
I have a histogram and I'm trying to fit the best normal (Gaussian) function to it, as you can see below. The problem is that the Gaussian fit isn't as good as I expected.
import matplotlib.pyplot as plt
import numpy as np
from astropy.modeling import models, fitting

bins = np.arange(-1, 8, 0.3)

# Reading data
a18 = np.loadtxt('AndXII18I.srt')
arr18 = np.array(a18[:, 11])

fig, axs = plt.subplots(2, 2)  # axs was not created in the original snippet
axs[0, 0].hist(arr18, bins, histtype='step')
axs[0, 0].set_xlim([np.min(arr18), np.max(arr18)])

x = np.linspace(-1, bins[len(bins) - 2], len(bins) - 1)
x1 = np.linspace(-1, 8, 1000)

# Guesses for the parameters (amplitude, mean, stddev):
g_init = models.Gaussian1D(1, 0, 1.)
fit_g = fitting.LevMarLSQFitter()
y18 = np.histogram(arr18, bins)  # y18 was missing from the snippet; the histogram counts are the fit target
g18 = fit_g(g_init, x, y18[0])

mean18 = g18.mean.value
t18 = g18.amplitude.value * np.exp(-(x1 - mean18)**2 / (2 * g18.stddev.value**2))
axs[0, 0].plot(x1, t18)
# edges18 and hist18 come from elsewhere in the full script (not shown):
# axs[0, 0].plot(edges18[8], hist18[8], 'o')
plt.show()
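In case it helps to separate the fitting step from the data file, here is a minimal self-contained version of the same idea on synthetic data (the sample below just stands in for column 11 of AndXII18I.srt), fitting at the bin centers, which are the natural x values to pair with histogram counts:
import numpy as np
import matplotlib.pyplot as plt
from astropy.modeling import models, fitting

rng = np.random.default_rng(0)
sample = rng.normal(loc=2.0, scale=1.2, size=500)   # stand-in for the real column of data

bins = np.arange(-1, 8, 0.3)
counts, edges = np.histogram(sample, bins=bins)
centers = 0.5 * (edges[:-1] + edges[1:])            # bin centers matching the counts

g_init = models.Gaussian1D(amplitude=counts.max(), mean=centers[np.argmax(counts)], stddev=1.0)
g_fit = fitting.LevMarLSQFitter()(g_init, centers, counts)

x1 = np.linspace(-1, 8, 1000)
plt.hist(sample, bins=bins, histtype='step')
plt.plot(x1, g_fit(x1))                             # evaluate the fitted model directly
plt.show()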
I am still trying to understand how solve_ivp works compared to odeint, but just as I was getting the hang of it something happened.
I am trying to solve for the motion of a nonlinear pendulum. With odeint, everything works like a charm; with solve_ivp, however, something weird happens:
import numpy as np
from matplotlib import pyplot as plt
from scipy.integrate import solve_ivp, odeint
g = 9.81
l = 0.1
def f(t, r):
omega = r[0]
theta = r[1]
return np.array([-g / l * np.sin(theta), omega])
time = np.linspace(0, 10, 1000)
init_r = [0, np.radians(179)]
results = solve_ivp(f, (0, 10), init_r, method="RK45", t_eval=time) #??????
cenas = odeint(f, init_r, time, tfirst=True)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(results.t, results.y[1])
ax1.plot(time, cenas[:, 1])
plt.show()
What am I missing?
It is a numerical problem. The default relative and absolute tolerances of solve_ivp are 1e-3 and 1e-6, respectively. For many problems, these values are too big, and tighter error tolerances should be given. The default relative tolerance for odeint is 1.49e-8.
If you add the argument rtol=1e-8 to the solve_ivp call, the plots agree:
import numpy as np
from matplotlib import pyplot as plt
from scipy.integrate import solve_ivp, odeint
g = 9.81
l = 0.1
def f(t, r):
omega = r[0]
theta = r[1]
return np.array([-g / l * np.sin(theta), omega])
time = np.linspace(0, 10, 1000)
init_r = [0, np.radians(179)]
results = solve_ivp(f, (0, 10), init_r, method='RK45', t_eval=time, rtol=1e-8)
cenas = odeint(f, init_r, time, tfirst=True)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(results.t, results.y[1])
ax1.plot(time, cenas[:, 1])
plt.show()
Plot: with rtol=1e-8 the solve_ivp and odeint curves coincide.
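For a quantitative check rather than an eyeball comparison, the maximum gap between the two solutions can be printed directly (reusing results and cenas from the script above):
print(np.max(np.abs(results.y[1] - cenas[:, 1])))   # small once rtol is tightened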
How do I find the best transformation matrix for aligning two 2D point sets so that the mean squared error is minimized? This is the code I have so far, but tform * src is not right:
import numpy as np
from skimage import transform as tf
from sklearn.metrics import mean_squared_error
# estimate transformation parameters
src = np.array([0, 0, 10, 10]).reshape((2, 2))
dst = np.array([12, 14, 1, -20]).reshape((2, 2))
tform = tf.estimate_transform('similarity', src, dst)
print(src)
print(dst)
print(tform.params)
msq=mean_squared_error(tform*src,dst)
Finally, I found the right answer to my question:
import numpy as np
from skimage import transform as tf
from sklearn.metrics import mean_squared_error
# estimate transformation parameters
src = np.array([0,0 , 1,0 , 1,1 , 0,1]).reshape((4, 2))
dst = np.array([3,1 , 3,2 , 2,2 , 2,1]).reshape((4, 2))
tform = tf.estimate_transform('similarity', src, dst)
# tform is the transformation matrix that aligns src to dst
print(src)
print(dst)
print(tform.params)
mt = tf.matrix_transform(src, tform.params)  # mt should be the same as dst
mean_squared_error(mt,dst) #should be zero
print( '{:.10f}'.format(mean_squared_error(mt,dst)) )
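For reference, the estimated transform object can also be applied directly (skimage transform objects are callable), and the MSE can be computed with plain NumPy, reusing src, dst and tform from above; it should again be essentially zero:
aligned = tform(src)                  # same result as matrix_transform(src, tform.params)
print(np.mean((aligned - dst) ** 2))  # mean squared error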
I am getting started with Spark linear regression. I am trying to fit a line to a linear dataset. It seems that the intercept is not adjusting correctly, or perhaps I am missing something.
With intercept=False:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=False)
This seems normal. But when I use intercept=True:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=True)
The model that I get in the last case is exactly:
(weights=[0.0353471289751], intercept=1.0005127185289888)
I have tried different datasets, step sizes, and iteration counts, but the model always converges with an intercept of about 1.
EDIT - This is the code I am using:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
sc = SparkContext("local", "regression")
# Generate data
SIZE = 300
SLOPE = 0.1
BASE = -30
NOISE = 10
x = np.arange(SIZE)
delta = np.random.uniform(-NOISE,NOISE, size=(SIZE,))
y = BASE + SLOPE*x + delta
data = list(zip(range(len(y)), y))  # zip with index
dataRDD = sc.parallelize(data)
# Normalize data
# mean = np.mean(data)
# std = np.std(data)
# dataRDD = dataRDD.map(lambda r: (r[0], (float(r[1])-mean)/std))
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [float(r[0])]))
# Create linear model
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=1000, step=0.0002, intercept=True, convergenceTol=0.000001)
print(linear_model)
true_vs_predicted = labeledData.map(lambda p: (p.label, linear_model.predict(p.features))).collect()
# PLOT
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid()
y_real = [x[0] for x in true_vs_predicted]
y_pred = [x[1] for x in true_vs_predicted]
plt.plot(range(len(y_real)), y_real, 'o', markersize=5, c='b')
plt.plot(range(len(y_pred)), y_pred, 'o', markersize=5, c='r')
plt.show()
This is because the number of iterations and the step size are both too small. As a result, the optimization stops before reaching the optimum.
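A possible adjustment under that diagnosis, with hypothetical values that may need tuning for a given dataset; normalizing the features and labels (as in the commented-out block in the question) also tends to make SGD converge far more reliably:
# Hypothetical settings: many more iterations and a tighter convergence tolerance
linear_model = LinearRegressionWithSGD.train(
    labeledData, iterations=10000, step=0.0005, intercept=True, convergenceTol=1e-9)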