I'm trying to implement a multiple linear regression algorithm. My data has two features (the size of the house in the first column and the number of bedrooms in the second column), and here is the head of my data set (the third column is the price of the house):
[[2.10400e+03 3.00000e+00 3.99900e+05]
[1.60000e+03 3.00000e+00 3.29900e+05]
[2.40000e+03 3.00000e+00 3.69000e+05]
[1.41600e+03 2.00000e+00 2.32000e+05]
[3.00000e+03 4.00000e+00 5.39900e+05]]
I wrote the following algorithm to compute the cost:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = np.loadtxt('data2.txt', delimiter=',')

y_data = list()
for i in range(len(data)):
    y_data.append(data[i][2])

#Convert our list to numpy arrays
y_data = np.asarray(y_data)
x_data = data[:, 0:2]

#applying feature scaling
for i in range(len(x_data)):
    #Size of the house
    x_data[i][0] = (x_data[i][0] - np.mean(x_data[:, 0])) / np.std(x_data[:, 0])
    #Number of rooms
    x_data[i][1] = (x_data[i][1] - np.mean(x_data[:, 1])) / np.std(x_data[:, 1])

#Adding a column of ones to our data
x_data = np.c_[np.ones(len(x_data)), x_data]

def cost(x, y, theta):
    m = len(x)
    predictions = np.arange(97).reshape(97, 1)
    for i in range(len(x)):
        predictions[i] = (x[i] * theta).sum()
    sqrErrors = (np.subtract(predictions, y)) ** 2
    return (1 / (2 * m)) * sqrErrors.sum()

theta = [[0],
         [0],
         [0]]
The cost function returns a value of 6361101029137.691, and when I run gradient descent the cost gets larger. What is the problem and how can I fix it? Thanks.
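For comparison, here is a minimal vectorized sketch of the same cost computation (my own sketch, not the original code); it keeps the predictions as floats and uses column vectors for y and theta so the subtraction does not broadcast a (m, 1) array against a (m,) array:

import numpy as np

def cost_vectorized(x, y, theta):
    # x: (m, n) design matrix, y: (m, 1) targets, theta: (n, 1) parameters
    m = len(x)
    predictions = x @ theta               # (m, 1), stays float
    sqr_errors = (predictions - y) ** 2   # shapes match, element-wise
    return sqr_errors.sum() / (2 * m)

# hypothetical usage with the arrays built above
# theta = np.zeros((3, 1))
# print(cost_vectorized(x_data, y_data.reshape(-1, 1), theta))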
How can I compute the Euclidean distance to the decision boundary of the EllipticEnvelope? Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.model_selection import train_test_split
feature, output = "temperature", "consumption"
data = pd.DataFrame(np.random.normal(0,15, size=(2355,2)), columns=[feature, output])
X = data[[feature, output]]
X_train, X_test = train_test_split(X, shuffle=True, train_size=0.8)
model = EllipticEnvelope(contamination=0.18)
model.fit(X_train)
# extract the model predictions
y_pred = pd.Series(model.predict(X), index=X.index, name="anomaly")
# define the meshgrid : X = (u,v).T
u_min, u_max = X_train.iloc[:, 0].min() - 1.5, X_train.iloc[:, 0].max() + 1.5
v_min, v_max = X_train.iloc[:, 1].min() - 1.5, X_train.iloc[:, 1].max() + 1.5
n_points = 500
u = np.linspace(u_min, u_max, n_points)
v = np.linspace(v_min, v_max, n_points)
U, V = np.meshgrid(u, v)
# evaluate the decision function on the meshgrid
W = model.decision_function(np.c_[U.ravel(), V.ravel()])
W = W.reshape(U.shape)
plt.figure(figsize=(20,6))
a = plt.contour(U, V, W, levels=[0], linewidths=2, colors="black")
b = plt.scatter(X.loc[y_pred == 1].iloc[:, 0], X.loc[y_pred == 1].iloc[:, 1], c="yellowgreen", edgecolors='k')
c = plt.scatter(X.loc[y_pred == -1].iloc[:, 0], X.loc[y_pred == -1].iloc[:, 1], c="tomato", edgecolors='k')
plt.legend([a.collections[0], b, c], ['learned frontier', 'regular observations', 'abnormal observations'], bbox_to_anchor=(1.05, 1))
plt.axis('tight')
plt.show()
Edits
I am able to get the decision boundary points using the following code. Now the problem can be solved by computing the distance numerically.
for item in a.collections:
    for i in item.get_paths():
        v = i.vertices
        x = v[:, 0]
        y = v[:, 1]
I have an obvious solution: take every data point d and compute the Euclidean distance between d and each boundary point e = (x, y). But it is a brute-force technique.. :D I will continue my research!
Another solution would be to fit an ellipse and compute the distance using the formula described by @epiliam here: https://math.stackexchange.com/questions/3670465/calculate-distance-from-point-to-ellipse-edge
I will provide a solution tomorrow based on the brute-force approach. It seems to work well for small datasets (n_rows < 10000). I did not test it on larger ones.
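Here is a minimal sketch of that brute-force idea (the names points and boundary are my own, assuming the contour vertices collected in the loop above): stack the vertices into an array and, for each observation, keep the smallest Euclidean distance to any vertex.

import numpy as np

def min_distance_to_boundary(points, boundary):
    # points: (n_points, 2), boundary: (n_vertices, 2)
    diff = points[:, None, :] - boundary[None, :, :]   # (n_points, n_vertices, 2)
    dist = np.sqrt((diff ** 2).sum(axis=2))            # distance to every vertex
    return dist.min(axis=1)                            # closest vertex per point

# hypothetical usage with the vertices collected above:
# boundary = np.column_stack([x, y])
# distances = min_distance_to_boundary(X.to_numpy(), boundary)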
I'm trying to add Python to my repertoire (R is my language of choice) and am having an issue with a simple line plot.
While the generated array (in this case, y) is of float type (which I want), when I plot a simple line plot using matplotlib, that same y is now truncated to the nearest whole integer.
Any help would be appreciated.
Thanks. Here's sample code.
P.S. Any hints as to cleaning up the code would also be more than welcome.
import sys
import numpy as np
from numpy import random
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as matplotlib
plt.style.use('ggplot')
greens = np.array([0,0])
others = np.array(np.arange(1,37))
# no axis provided, array elements will be flattened
roulette = np.append(greens, others)
spins1000 = np.array(random.choice(roulette, size=(1000)))
# Create function for cum mean in python
def cum_mean(arr):
    cum_sum = np.cumsum(arr, axis=0)
    for i in range(cum_sum.shape[0]):
        if i == 0:
            continue
        print(cum_sum[i] / (i + 1))
        cum_sum[i] = cum_sum[i] / (i + 1)
    return cum_sum
y = np.array(cum_mean(spins1000))
x = np.array(np.arange(1,1001))
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(xlim=(0, 1000), ylim=(10.00, 25.00))
line = ax.plot(x, y, color='red', lw=1)[0]
plt.draw()
plt.show()
There are two things happening which, in combination, cause the strange behavior.
First, cum_sum = np.cumsum(arr, axis=0), with arr being an array of integers, makes cum_sum an array of integers as well.
Second, in the loop, writing cum_sum[i] = cum_sum[i] / (i + 1) stores the result (which is a float) into an integer array; this assignment truncates the number.
A solution would be either to create cum_sum as float (as in cum_sum = np.cumsum(arr, dtype=float)), or to do things "the numpy way" and create a new array in one go: return cum_sum / np.arange(1, cum_sum.shape[0] + 1). Note that numpy's array operations are vectorized, so dividing an array by an array gives the same result as dividing element by element, and it runs much faster (similar to what happens in R).
Also, if you wrote cum_sum = cum_sum / np.arange(1, 1001), cum_sum would become a new float array; it is only because you assign into it element by element that it stays an array of integers. Note that np.arange() already creates a numpy array, so wrapping it in np.array() again doesn't change anything.
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
greens = np.array([0, 0])
others = np.arange(1, 37)
# no axis provided, array elements will be flattened
roulette = np.append(greens, others)
spins1000 = np.array(np.random.choice(roulette, size=(1000)))
# Create function for cum mean in python
def cum_mean(arr):
    cum_sum = np.cumsum(arr)
    return cum_sum / np.arange(1, cum_sum.shape[0] + 1)
y = cum_mean(spins1000)
x = np.arange(1, 1001)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(xlim=(0, 1000), ylim=(10.00, 25.00))
line = ax.plot(x, y, color='red', lw=1)[0]
plt.show()
I want to fit a 2-component mixture model with sklearn and then calculate the posterior probabilities from it. But with the code I have so far, the fit for one of the two distributions is perfect (overfitting?) and the other one is very poor. I made a dummy example by sampling from 2 Gaussians:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
def calc_pdf():
    """
    calculate gauss mixture modelling for 2 comp
    return pdfs
    """
    d = np.random.normal(-0.1, 0.07, 5000)
    t = np.random.normal(0.2, 0.13, 10000)
    pool = np.concatenate([d, t]).reshape(-1, 1)
    label = ['d'] * d.shape[0] + ['t'] * t.shape[0]
    X = pool[pool > 0].reshape(-1, 1)
    X = np.log(X)
    clf = GaussianMixture(
        n_components=2,
        covariance_type='full',
        tol=1e-24,
        max_iter=1000
    )
    logprob = clf.fit(X).score_samples(X)
    responsibilities = clf.predict_proba(X)
    pdf = np.exp(logprob)
    pdf_individual = responsibilities * pdf[:, np.newaxis]
    plot_gauss(np.log(d), np.log(t), pdf_individual, X)
    return pdf_individual[0], pdf_individual[1]

def plot_gauss(d, t, pdf_individual, x):
    fig, ax = plt.subplots(figsize=(12, 9), facecolor='white')
    ax.hist(d, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.hist(t, 30, density=True, histtype='stepfilled', alpha=0.4)
    ax.plot(x, pdf_individual, '.')
    ax.set_xlabel('$x$')
    ax.set_ylabel('$p(x)$')
    plt.show()

calc_pdf()
which produces this plot here
Is there something obvious that I am missing?
I'm trying to get the four first-order histogram statistics (mean, variance, skewness and kurtosis) from a histogram.
I have this code that calculates the histogram:
import cv2
from matplotlib import pyplot as plt
img1 = 'img.jpg'
gray_img = cv2.imread(img1, cv2.IMREAD_GRAYSCALE)
plt.hist(gray_img.ravel(),256,[0,256])
plt.title('Histogram for gray scale picture')
plt.show()
How can I get those statistics?
Based on my answer here
import numpy as np

def mean_h(val, freq):
    return np.average(val, weights=freq)

def var_h(val, freq):
    dev = freq * (val - mean_h(val, freq)) ** 2
    return dev.sum() / freq.sum()

def moment_h(val, freq, n):
    # n-th standardized moment: E[(x - mean)^n] / std^n
    num = (freq * (val - mean_h(val, freq)) ** n).sum() / freq.sum()
    den = var_h(val, freq) ** (n / 2)
    return num / den
Skewness and kurtosis are just the 3rd and 4th standardized moments.
If the number of bins is reasonable, you should be able to just count the values manually, put them in a vector, and calculate all those moments.
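To tie this back to the grayscale image in the question, here is a minimal sketch (my own wiring, reusing the helpers above, with the histogram built via np.histogram instead of plt.hist):

import cv2
import numpy as np

gray_img = cv2.imread('img.jpg', cv2.IMREAD_GRAYSCALE)
# counts per intensity level 0..255
freq, _ = np.histogram(gray_img.ravel(), bins=256, range=(0, 256))
val = np.arange(256)

mean = mean_h(val, freq)
variance = var_h(val, freq)
skewness = moment_h(val, freq, 3)   # 3rd standardized moment
kurtosis = moment_h(val, freq, 4)   # 4th standardized moment
print(mean, variance, skewness, kurtosis)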
I want to determine the best value of k (the number of clusters) for the KMeans algorithm on a dataset.
I found a resource in the scikit-learn documentation: Gaussian Mixture Model Selection using the BIC criterion.
I found example code on the site and adapted it to my dataset.
But each run of this code gives a different optimal value of k. Why?
Here is the code:
import numpy as np
import pandas as pd
import itertools
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import mixture

print(__doc__)

# Number of samples per component
n_samples = 440

path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/' + file)
X = np.array(data.iloc[:, 2:])

lowest_bic = np.infty
bic = []
n_components_range = range(1, 12)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                              'darkorange'])
clf = best_gmm
print(clf)
bars = []

# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
#spl = plt.plot()
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                  (i + 1) * len(n_components_range)],
                        width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + \
    .2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)

# Plot the winner
splot = plt.subplot(2, 1, 2)
Y_ = clf.predict(X)
for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                           color_iter)):
    v, w = linalg.eigh(cov)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

    # Plot an ellipse to show the Gaussian component
    angle = np.arctan2(w[0][1], w[0][0])
    angle = 180. * angle / np.pi  # convert to degrees
    v = 2. * np.sqrt(2.) * np.sqrt(v)
    ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(.5)
    splot.add_artist(ell)

plt.xticks(())
plt.yticks(())
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
plt.show()
Here is the link to my dataset:
https://drive.google.com/open?id=1yMw1rMh12ml6Lh3yrL6WDLbEnLM-SmiN
Do you have an explanation for this behaviour?
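One hedged note, as a sketch rather than a definitive answer: GaussianMixture initializes EM randomly unless random_state is set, so repeated runs can converge to different local optima and therefore different BIC winners. Something like this inside the loop makes runs comparable:

gmm = mixture.GaussianMixture(n_components=n_components,
                              covariance_type=cv_type,
                              n_init=10,        # keep the best of several restarts
                              random_state=0)   # reproducible initialization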