Generating and evaluating 200 Normals and reducing them - python-3.x

I am trying to estimate a normal density using a quadratic approximation in tensorflow (code 4.14 from McElreath's Statistical Rethinking).
The code I have so far is:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=4.0, stop=9.0, num=200)
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(df.height))
This fails because df.height has shape (352,), while I am creating (200,) points for my normal distribution to be evaluated on.
However
tf.reduce_sum(tfd.Normal(loc=mu, scale=sigma).log_prob(2))
and
tf.reduce_sum(tfd.Normal(loc=mu[0], scale=sigma[0]).log_prob(df.height))
both work.
I need to create a (200, 352) tensor - one Normal for each mu, sigma on my grid, and then evaluate it with my sample data - df. The question I have is: how do I do this?
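For reference, broadcasting alone can produce that tensor: adding a trailing axis to mu and sigma pairs mu[i] with sigma[i] and lets log_prob broadcast against all 352 observations. A minimal sketch, assuming df.height can be cast to float32:
heights = tf.constant(df.height.values, dtype=tf.float32)                      # shape (352,)
grid_normals = tfd.Normal(loc=mu[:, tf.newaxis], scale=sigma[:, tf.newaxis])   # batch shape (200, 1)
log_lik = grid_normals.log_prob(heights)                                       # shape (200, 352)
total_log_lik = tf.reduce_sum(log_lik, axis=-1)                                # one value per (mu, sigma) pair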

I think TFP's joint distribution is a nice way to express this:
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
def mk_joint(nobs):
    return tfd.JointDistributionNamed(dict(
        mu=tfd.Normal(178, 20),
        sigma=tfd.Uniform(0, 50),
        height=lambda mu, sigma: tfd.Sample(tfd.Normal(loc=mu, scale=sigma), nobs)
    ))
joint = mk_joint(len(df))
joint.sample()
print(f'joint event shape: {joint.event_shape}')
lp = joint.log_prob(dict(mu=mu[:,tf.newaxis], sigma=sigma, height=df.height))
import matplotlib.pyplot as plt
plt.imshow(lp)
plt.xlabel('sigma')
plt.xticks(np.arange(len(sigma))[::10], sigma[::10].numpy().round(2), rotation=90)
plt.ylabel('mu')
plt.yticks(np.arange(len(mu))[::10], mu[::10].numpy().round(2))
plt.show()
=>
joint event shape: {'sigma': TensorShape([]), 'mu': TensorShape([]), 'height': TensorShape([352])}
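As a follow-up sketch (not part of the original answer, and assuming lp keeps the (200, 200) shape implied by the broadcasting above, with mu along the rows and sigma along the columns), the grid point with the highest joint log-probability can be read off directly:
flat_idx = tf.argmax(tf.reshape(lp, [-1]))
mu_idx, sigma_idx = flat_idx // len(sigma), flat_idx % len(sigma)
print('MAP on the grid:', mu[mu_idx].numpy(), sigma[sigma_idx].numpy())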

So, I figured out that one way to do it would be to create a (200, 200, 352) grid and then reshape, and the rest of the calculations follow straightforwardly.
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
_BASE_URL = "https://raw.githubusercontent.com/rmcelreath/rethinking/Experimental/data"
HOWELL_DATASET_PATH = f"{_BASE_URL}/Howell1.csv"
df = pd.read_csv(HOWELL_DATASET_PATH, sep=';')
df = df[df['age'] >= 18]
mu = tf.linspace(start=140.0, stop=160.0, num=200)
sigma = tf.linspace(start=7.0, stop=9.0, num=200)
means, variances, _ = tf.meshgrid(mu, sigma, np.zeros((352,)).astype(np.float32))
means = tf.reshape(means, [40000, 352])
variances = tf.reshape(variances, [40000, 352])  # note: despite the name, these hold the sigma grid (standard deviations)
normal = tfd.Normal(loc=means, scale=variances)  # scale expects a standard deviation
log_lik = tf.reduce_sum(normal.log_prob(df.height), axis=1)
logprob_mu = tfd.Normal(178.0, 20.0).log_prob(means)
logprob_sigma = tfd.Uniform(low=0.0, high=50.0).log_prob(variances)
log_joint_prod = log_lik + logprob_mu[:, 0] + logprob_sigma[:, 0]
joint_prob_tf = tf.exp(log_joint_prod - tf.reduce_max(log_joint_prod))
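A possible next step, sketched under the assumption that the meshgrid/reshape ordering above is preserved: normalize the joint probabilities to get the grid-approximated posterior and reshape it back to (200, 200) for plotting.
posterior = joint_prob_tf / tf.reduce_sum(joint_prob_tf)   # grid approximation of the posterior
posterior_grid = tf.reshape(posterior, [200, 200])         # with tf.meshgrid's default indexing='xy', rows follow sigma and columns mu (worth verifying)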

Related

When should I use matrix.astype and when matrix.view?

I have a matrix of int32 integers (50k x 50k or so) which I need to convert to float32. I can do that with
# Preparation for the example
import numpy as np
n = 50_000
matrix = np.random.randint(0, 10, (n, n), dtype='int32')
# Way 1:
matrix = matrix.astype(np.float32, copy=False)
# Way 2:
matrix = matrix.view(np.float32)
When should I use which one? Is there a speed-disadvantage in later use of a view compared to a "real" numpy array?
What I tried
Execution time analysis (creation, not later access)
import numpy as np
import timeit
def create_boxplot(duration_list, showfliers=False):
    import seaborn as sns
    import matplotlib.pyplot as plt
    import operator
    plt.figure(num=None, figsize=(8, 4), dpi=300, facecolor="w", edgecolor="k")
    sns.set(style="whitegrid")
    sorted_keys, sorted_vals = zip(
        *sorted(duration_list.items(), key=operator.itemgetter(1))
    )
    flierprops = dict(markerfacecolor="0.75", markersize=1, linestyle="none")
    ax = sns.boxplot(
        data=sorted_vals,
        width=0.3,
        orient="h",
        flierprops=flierprops,
        showfliers=showfliers,
    )
    ax.set(xlabel="Time in ms", ylabel="")
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig("output.png")
n = 5_000
matrix = np.random.randint(0, 2, (n, n), dtype='int32')
print(matrix.dtype)
matrix = matrix.view(np.float32)
print(matrix.dtype)
timeit_d = {}
timeit_d["repeat"] = 500
timeit_d["number"] = 3
timeit_d["setup"] = "import numpy as np; n=5_000; matrix = np.random.randint(0, 2, (n, n), dtype='int32')"
duration_list = {}
# Way 1
durations = timeit.repeat(
    "matrix2 = matrix.view(np.float32)",
    setup=timeit_d["setup"],
    repeat=timeit_d["repeat"],
    number=timeit_d["number"],
)
duration_list["view"] = durations
print("Done views")
# Way 2
durations = timeit.repeat(
    "matrix2 = matrix.astype(np.float32)",
    setup=timeit_d["setup"],
    repeat=timeit_d["repeat"],
    number=timeit_d["number"],
)
duration_list["astype"] = durations
print("Done astype")
# Visualize
create_boxplot(duration_list)
Clearly views are way faster than astype.
Memory analysis
$ valgrind --tool=massif python3 foobar.py
$ massif-visualizer massif.out.view
Clearly shows that the view option uses WAY less memory.
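One caveat worth adding here (a small illustrative sketch, not part of the original question): view reinterprets the existing int32 bytes as float32 without converting the values, whereas astype produces the numerically equivalent floats, so the two are not interchangeable.
import numpy as np
a = np.array([1, 2, 3], dtype=np.int32)
print(a.astype(np.float32))   # [1. 2. 3.]  -- values converted
print(a.view(np.float32))     # tiny denormal floats -- raw bits reinterpreted, not converted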

How to have the best gaussian fit on a histogram plot

I have a histogram and I'm trying to fit the best normal (Gaussian) function to it, as you can see below. The problem is that the Gaussian fit isn't as good as I expected.
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.mlab as mlab
from astropy.modeling import models, fitting
bins=np.arange(-1,8,0.3)
#Reading data
a18 = np.loadtxt('AndXII18I.srt')
arr18 = np.array(a18[:,11])
axs[0,0].hist(arr18,bins,histtype='step')
axs[0,0].set_xlim([np.min(arr18), np.max(arr18)])
x = np.linspace(-1, bins[len(bins)-2],len(bins)-1)
x1 = np.linspace(-1, 8, 1000)
# guesses for the parameters:
g_init = models.Gaussian1D(1, 0, 1.)
fit_g = fitting.LevMarLSQFitter()
g18 = fit_g(g_init, x, y18[0])
a18 = g18.mean
t18 = g18.amplitude * np.exp(-(x1 - a18)**2 / (2 * g18.stddev**2))
axs[0,0].plot(x1, t18)
axs[0,0].plot(edges18[8], hist18[8], 'o')
plt.show()
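Since the snippet above omits how axs, y18, hist18, and edges18 are built, here is a small self-contained sketch with synthetic data (an assumption about the intent: fit the bin counts at the bin centers) showing one way to fit a Gaussian1D to a histogram with astropy and evaluate it on a fine grid:
import numpy as np
import matplotlib.pyplot as plt
from astropy.modeling import models, fitting

data = np.random.normal(loc=3.0, scale=1.2, size=2000)         # synthetic sample
counts, edges = np.histogram(data, bins=np.arange(-1, 8, 0.3))
centers = 0.5 * (edges[:-1] + edges[1:])                        # fit at bin centers, not edges

g_init = models.Gaussian1D(amplitude=counts.max(), mean=centers[np.argmax(counts)], stddev=1.0)
fit_g = fitting.LevMarLSQFitter()
g_fit = fit_g(g_init, centers, counts)

x1 = np.linspace(-1, 8, 1000)
plt.step(centers, counts, where='mid')
plt.plot(x1, g_fit(x1))                                         # the fitted model is callable
plt.show()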

Using scipy's solve_ivp to solve nonlinear pendulum motion

I am still trying to understand how solve_ivp compares to odeint, but just as I was getting the hang of it, something happened.
I am trying to solve for the motion of a nonlinear pendulum. With odeint everything works like a charm; with solve_ivp, however, something weird happens:
import numpy as np
from matplotlib import pyplot as plt
from scipy.integrate import solve_ivp, odeint
g = 9.81
l = 0.1
def f(t, r):
    omega = r[0]
    theta = r[1]
    return np.array([-g / l * np.sin(theta), omega])
time = np.linspace(0, 10, 1000)
init_r = [0, np.radians(179)]
results = solve_ivp(f, (0, 10), init_r, method="RK45", t_eval=time) #??????
cenas = odeint(f, init_r, time, tfirst=True)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(results.t, results.y[1])
ax1.plot(time, cenas[:, 1])
plt.show()
What am I missing?
It is a numerical problem. The default relative and absolute tolerances of solve_ivp are 1e-3 and 1e-6, respectively. For many problems, these values are too big, and tighter error tolerances should be given. The default relative tolerance for odeint is 1.49e-8.
If you add the argument rtol=1e-8 to the solve_ivp call, the plots agree:
import numpy as np
from matplotlib import pyplot as plt
from scipy.integrate import solve_ivp, odeint
g = 9.81
l = 0.1
def f(t, r):
    omega = r[0]
    theta = r[1]
    return np.array([-g / l * np.sin(theta), omega])
time = np.linspace(0, 10, 1000)
init_r = [0, np.radians(179)]
results = solve_ivp(f, (0, 10), init_r, method='RK45', t_eval=time, rtol=1e-8)
cenas = odeint(f, init_r, time, tfirst=True)
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(results.t, results.y[1])
ax1.plot(time, cenas[:, 1])
plt.show()
Plot: with rtol=1e-8, the solve_ivp and odeint curves coincide.

Find the best transformation matrix for two 2D point sets

How do I find the best transformation matrix for aligning two 2D point sets so as to minimize the mean squared error? This is the code I have so far, but the line tform * src is not right:
import numpy as np
from skimage import transform as tf
from sklearn.metrics import mean_squared_error
# estimate transformation parameters
src = np.array([0, 0, 10, 10]).reshape((2, 2))
dst = np.array([12, 14, 1, -20]).reshape((2, 2))
tform = tf.estimate_transform('similarity', src, dst)
print(src)
print(dst)
print(tform.params)
msq=mean_squared_error(tform*src,dst)
Finally, I found the right answer to my question:
import numpy as np
from skimage import transform as tf
from sklearn.metrics import mean_squared_error
# estimate transformation parameters
src = np.array([0,0 , 1,0 , 1,1 , 0,1]).reshape((4, 2))
dst = np.array([3,1 , 3,2 , 2,2 , 2,1]).reshape((4, 2))
tform = tf.estimate_transform('similarity', src, dst)
#tform is the transformation matrix for these data to align them
print(src)
print(dst)
print(tform.params)
mt = tf.matrix_transform(src, tform.params)  # mt is the same as dst
mean_squared_error(mt,dst) #should be zero
print( '{:.10f}'.format(mean_squared_error(mt,dst)) )
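A small aside, based on my understanding of scikit-image's geometric transforms rather than on the original answer: the estimated transform object is itself callable, so it can be applied directly instead of going through tform.params:
mt = tform(src)  # apply the estimated transform directly; should equal dst
print('{:.10f}'.format(mean_squared_error(mt, dst)))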

Wrong intercept in Spark linear regression

I am getting started with Spark linear regression. I am trying to fit a line to a linear dataset, but it seems that the intercept is not adjusting correctly, or perhaps I am missing something.
With intercept=False:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=False)
This seems normal. But when I use intercept=True:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=True)
The model that I get in the last case is exactly:
(weights=[0.0353471289751], intercept=1.0005127185289888)
I have tried different datasets, step sizes, and iterations, but the model always converges with an intercept of about 1.
EDIT - This is the code I am using:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
sc = SparkContext("local", "regression")
# Generate data
SIZE = 300
SLOPE = 0.1
BASE = -30
NOISE = 10
x = np.arange(SIZE)
delta = np.random.uniform(-NOISE,NOISE, size=(SIZE,))
y = BASE + SLOPE*x + delta
data = zip(range(len(y)), y) # zip with index
dataRDD = sc.parallelize(data)
# Normalize data
# mean = np.mean(data)
# std = np.std(data)
# dataRDD = dataRDD.map(lambda r: (r[0], (float(r[1])-mean)/std))
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [float(r[0])]))
# Create linear model
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=1000, step=0.0002, intercept=True, convergenceTol=0.000001)
print linear_model
true_vs_predicted = labeledData.map(lambda p: (p.label, linear_model.predict(p.features))).collect()
# PLOT
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid()
y_real = [x[0] for x in true_vs_predicted]
y_pred = [x[1] for x in true_vs_predicted]
plt.plot(range(len(y_real)), y_real, 'o', markersize=5, c='b')
plt.plot(range(len(y_pred)), y_pred, 'o', markersize=5, c='r')
plt.show()
This is because the number of iterations and the step size are both too small. As a result, the optimization ends before reaching the optimum.
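As a rough illustration of that point (the values below are assumptions, not tuned settings): scaling the feature, as in the commented-out normalization above, and allowing more iterations gives SGD room to move the intercept away from its starting value.
# Sketch only: standardize the feature, then train with more iterations and a larger step.
x_mean, x_std = x.mean(), x.std()
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [(float(r[0]) - x_mean) / x_std]))
linear_model = LinearRegressionWithSGD.train(
    labeledData, iterations=5000, step=0.1, intercept=True, convergenceTol=1e-6)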
