Model selection & Selecting the number of active components in Bayesian Gaussian Mixture Models - scikit-learn

I have generated 2 groups of 1-D data points which are visually clearly separable and I want to use a Bayesian Gaussian Mixture Model (BGMM) to ideally recover 2 clusters.
Since BGMMs maximize a lower bound on the model evidence (ELBO) and given that the ELBO is supposed to combine notions of accuracy and complexity, I would expect more complex models to be penalized.
However, when running Grid Search over the number of clusters, I often get a solution with more than 2 clusters. More specifically, I often get the maximal number of clusters on my grid search. In the example below, I would expect the best model to define 2 clusters. Instead, the best model defines 4 but assigns minimal weights to 2 out of 4 clusters.
I am really surprised, since 2 out of 4 clusters are therefore adding little information and this more complex model still gets selected as the best model.
Why is the BGMM then picking 4 clusters for the best model?
If this is indeed the behavior a BGMM should show, how can I then assess how many active components I actually have in my model? Visually? By defining an arbitrary threshold on the weights?
I have added the code to reproduce my example below.
# Import statements
import itertools
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from joblib import Parallel, delayed
from sklearn.mixture import BayesianGaussianMixture
from sklearn.utils import shuffle
def fitmodel(x, params):
    """Build a BayesianGaussianMixture from ``params`` and fit it to ``x``.

    Called once per parameter setting by the parallel grid-search loop.
    ``params`` is unpacked as keyword arguments of the estimator; the value
    returned by ``fit`` is handed back to the caller.
    """
    mixture = BayesianGaussianMixture(**params)
    return mixture.fit(x, y=None)
def plot_results(X, means, covariances, title):
    """Scatter the samples and overlay one normal PDF per mixture component.

    Parameters
    ----------
    X : array-like
        Samples; plotted on the x axis against a random vertical jitter.
    means, covariances : iterables of equal length
        Per-component mean and covariance. Only the first entry of each is
        used (assumes 1-D data, as in the surrounding example).
    title : str
        Title of the current axes.
    """
    # Random y positions only spread the points out so overlaps stay visible.
    plt.plot(X, np.random.uniform(low=0, high=1, size=len(X)), 'o', alpha=0.1,
             color='cornflowerblue', label='data points')
    n_sd = 2.5  # half-width of each plotted PDF, in standard deviations
    for i, (mean, covar) in enumerate(zip(means, covariances)):
        mu = float(np.ravel(mean)[0])
        # The mixture stores variances; norm.pdf and the x-range need the
        # standard deviation.
        sd = float(np.sqrt(np.ravel(covar)[0]))
        x = np.linspace(mu - n_sd * sd, mu + n_sd * sd, 300)
        y = stats.norm.pdf(x, mu, sd)
        # Label only the first curve so the legend shows a single entry.
        label = 'Component PDF' if i == 0 else None
        plt.plot(x, y, color='darkorange', label=label)
    plt.yticks(())
    plt.title(title)
# Generate data: two separated groups of 100 uniform samples each
g1 = np.random.uniform(low=-1.5, high=-1, size=(1, 100))
# low must be the smaller bound for np.random.uniform
g2 = np.random.uniform(low=1, high=1.5, size=(1, 100))
X = np.append(g1, g2)
# Shuffle data and reshape to a single-feature column
X = shuffle(X)
X = X.reshape(-1, 1)
# Define parameters for grid search
parameters = {
    'n_components': [1, 2, 3, 4],
    'weight_concentration_prior_type': ['dirichlet_distribution'],
}
# Create permutations of parameter settings
keys, values = zip(*parameters.items())
param_grid = [dict(zip(keys, v)) for v in itertools.product(*values)]
# Run GridSearch using parallel for loop (one fit per parameter setting)
num_cores = multiprocessing.cpu_count()
list_clf = Parallel(n_jobs=num_cores)(delayed(fitmodel)(X, params) for params in param_grid)
# Print best model (based on lower bound on model evidence)
lower_bounds = [clf.lower_bound_ for clf in list_clf]  # Extract lower bounds on model evidence
# argmax yields a single index even when several models tie for the maximum
idx = int(np.argmax(lower_bounds))  # Find best model
best_estimator = list_clf[idx]
print(f'Parameter setting of best model: {param_grid[idx]}')
print(f'Components weights: {best_estimator.weights_}')
# Plot data points and gaussian components
plt.figure(figsize=(8, 6))
ax = plt.subplot(2, 1, 1)
if best_estimator.weight_concentration_prior_type == 'dirichlet_process':
    prior_label = 'Dirichlet process'
elif best_estimator.weight_concentration_prior_type == 'dirichlet_distribution':
    prior_label = 'Dirichlet distribution'
plot_results(X, best_estimator.means_, best_estimator.covariances_,
             f'Best Bayesian GMM | {prior_label} prior')
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
plt.legend(fontsize='small')
# Plot histogram of weights: one bar per component, annotated with its share
ax = plt.subplot(2, 1, 2)
for k, w in enumerate(best_estimator.weights_):
    plt.bar(k, w,
            width=0.9,
            color='#56B4E9',
            zorder=3,
            align='center',
            edgecolor='black'
            )
    plt.text(k, w + 0.01, "%.1f%%" % (w * 100.),
             horizontalalignment='center')
ax.get_xaxis().set_tick_params(direction='out')
ax.yaxis.grid(True, alpha=0.7)
plt.xticks(range(len(best_estimator.weights_)))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.4)
plt.ylabel('Component weight')
# Leave 25% headroom above the tallest bar for the percentage labels
plt.ylim(0, np.max(best_estimator.weights_)+0.25*np.max(best_estimator.weights_))
plt.yticks(())
plt.savefig('bgmm_clustering.png')
plt.show()
plt.close()

Related

Sklearn's TfidfTransformer(use_idf=False, norm=None) returns the same output as CountVectorizer()

I am trying to understand the code behind TfidfTransformer(). From sklearn's documentation, I can get the term frequencies by setting use_idf=False. But when I check the code on Github, I noticed that the TfidfTransformer() will return the same value as CountVectorizer() when not using normalization, which is just the count of each term.
The code that is supposed to calculate term frequencies.
def transform(self, X, copy=True):
    """Transform a count matrix to a tf or tf-idf representation.

    Parameters
    ----------
    X : sparse matrix of (n_samples, n_features)
        A matrix of term/token counts.
    copy : bool, default=True
        Whether to copy X and operate on the copy or perform in-place
        operations.

    Returns
    -------
    vectors : sparse matrix of shape (n_samples, n_features)
        Tf-idf-weighted document-term matrix.
    """
    X = self._validate_data(
        X, accept_sparse="csr", dtype=FLOAT_DTYPES, copy=copy, reset=False
    )
    if not sp.issparse(X):
        X = sp.csr_matrix(X, dtype=np.float64)

    # Each step below is optional. When sublinear_tf and use_idf are falsy and
    # norm is None, none of them runs and the validated counts are returned
    # as they are.
    if self.sublinear_tf:
        # In-place log(count) + 1 on the stored (non-zero) entries.
        np.log(X.data, X.data)
        X.data += 1

    if self.use_idf:
        # idf_ being a property, the automatic attributes detection
        # does not work as usual and we need to specify the attribute
        # name:
        check_is_fitted(self, attributes=["idf_"], msg="idf vector is not fitted")

        # *= doesn't work
        X = X * self._idf_diag

    if self.norm is not None:
        X = normalize(X, norm=self.norm, copy=False)

    return X
image of code above
To investigate more, I ran both classes and compared the output of both CountVectorizer and TfidfTransformer using the following code and the output is equal.
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Three newsgroup categories from the training split, metadata stripped.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
    subset='train',
    categories=['sci.electronics', 'rec.autos', 'rec.sport.hockey'],
)
train_documents = dataset.data

# Raw term counts.
vectorizer = CountVectorizer()
train_documents_mat = vectorizer.fit_transform(train_documents)

# Same counts passed through a transformer with idf and normalisation off.
tf_vectorizer = TfidfTransformer(use_idf=False, norm=None)
train_documents_mat_2 = tf_vectorizer.fit_transform(train_documents_mat)

# Element-wise comparison of the two dense matrices.
equal = np.array_equal(train_documents_mat.toarray(), train_documents_mat_2.toarray())
print(equal)
I am trying to get the term frequencies for my documents rather than just the counts. Any ideas why sklearn implements TF-IDF in this way?

Draw 3D Plot for Gensim model

I have trained my model using Gensim. I drew a 2D plot using PCA, but it is not very clear because the result is so dense. I want to change it to a 3D plot that can be zoomed.
from sklearn.decomposition import PCA
from matplotlib import pyplot
# Rows of X follow the iteration order of model.wv.vocab.
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])
# Map every vocabulary word to its row in `result` so labels land on the
# right point.
row_of = {vocab_word: row for row, vocab_word in enumerate(model.wv.vocab)}
# most_similar returns (word, similarity) pairs; keep only the words.
words = [neighbour for neighbour, _ in model.wv.most_similar('eden_lake')]
for word in words:
    i = row_of[word]
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
And the result:
Is it possible to do that?
The following function uses t-SNE instead of PCA for dimension reduction, but will generate a plot in two, three or both two and three dimensions (using subplots). Furthermore, it will color the topics for you so it's easier to distinguish them. Adding %matplotlib notebook to the start of a Jupyter notebook environment from anaconda will allow a 3d plot to be rotated and a 2d plot to be zoomed (don't do both versions at the same time with %matplotlib notebook).
The function is very long, with most of the code being for plot formatting, but produces a professional output.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from gensim.models import LdaModel
from gensim import corpora
from sklearn.manifold import TSNE
# %matplotlib notebook # if in Jupyter for rotating and zooming
def LDA_tSNE_topics_vis(dimension='both',
                        corpus=None,
                        num_topics=10,
                        remove_3d_outliers=False,
                        save_png=False):
    """
    Plots the outputs of an LDA model using t-SNE (t-distributed Stochastic Neighbor Embedding)

    Note: t-SNE reduces the dimensionality of a space such that similar points will be closer and dissimilar points farther

    Parameters
    ----------
        dimension : str (default=both)
            The dimension that t-SNE should reduce the data to for visualization
            Options: 2d, 3d, and both (a plot with two subplots)

        corpus : list, list of lists
            The tokenized and cleaned text corpus over which analysis should be done

        num_topics : int (default=10)
            The number of categories for LDA based approaches

        remove_3d_outliers : bool (default=False)
            Whether to remove outliers from a 3d plot

        save_png : bool (default=False)
            Whether to save the figure as a png

    Returns
    -------
        None. The figure is displayed with plt.show(); when save_png is set it is
        first written to 'LDA_tSNE_<timestamp>.png'

    Raises
    ------
        ValueError
            If dimension is not one of 2d, 3d, or both
    """
    # Local import: only needed for the timestamp in the saved file name.
    import time

    # Validate up front so a bad argument fails before the model is trained.
    if dimension not in ('both', '2d', '3d'):
        raise ValueError("An invalid value has been passed to the 'dimension' argument - choose from 2d, 3d, or both.")

    # Background/legend colour shared by every plotting branch.
    light_grey_tup = (242/256, 242/256, 242/256)

    dirichlet_dict = corpora.Dictionary(corpus)
    bow_corpus = [dirichlet_dict.doc2bow(text) for text in corpus]

    dirichlet_model = LdaModel(corpus=bow_corpus,
                               id2word=dirichlet_dict,
                               num_topics=num_topics,
                               update_every=1,
                               chunksize=len(bow_corpus),
                               passes=10,
                               alpha='auto',
                               random_state=42) # set for testing

    # One row per document, one column per topic, filled with the values the
    # model returns for that document (topics it does not return stay 0).
    df_topic_coherences = pd.DataFrame(columns = ['topic_{}'.format(i) for i in range(num_topics)])

    for i, bow in enumerate(bow_corpus):
        df_topic_coherences.loc[i] = [0] * num_topics

        output = dirichlet_model.__getitem__(bow=bow, eps=0)

        for topic_num, coherence in output:
            df_topic_coherences.iloc[i, topic_num] = coherence

    for i in range(num_topics):
        df_topic_coherences.iloc[:, i] = df_topic_coherences.iloc[:, i].astype('float64', copy=False)

    # Column name ('topic_<k>') holding the largest value in each row.
    df_topic_coherences['main_topic'] = df_topic_coherences.iloc[:, :num_topics].idxmax(axis=1)

    if num_topics > 10:
        # cubehelix better for more than 10 colors
        colors = sns.color_palette("cubehelix", num_topics)
    else:
        # The default sns color palette
        colors = sns.color_palette('deep', num_topics)

    # NOTE(review): newer scikit-learn releases appear to rename TSNE's n_iter
    # argument - confirm against the installed version.
    tsne_2 = None
    tsne_3 = None
    if dimension in ('both', '2d'):
        tsne_2 = TSNE(n_components=2, perplexity=40, n_iter=300)
    if dimension in ('both', '3d'):
        tsne_3 = TSNE(n_components=3, perplexity=40, n_iter=300)

    if tsne_2 is not None:
        tsne_results_2 = tsne_2.fit_transform(df_topic_coherences.iloc[:, :num_topics])

        df_tsne_2 = pd.DataFrame()
        df_tsne_2['tsne-2d-d1'] = tsne_results_2[:,0]
        df_tsne_2['tsne-2d-d2'] = tsne_results_2[:,1]
        df_tsne_2['main_topic'] = df_topic_coherences.iloc[:, num_topics]
        df_tsne_2['color'] = [colors[int(t.split('_')[1])] for t in df_tsne_2['main_topic']]

        # Sort rows by topic number, then drop the helper column.
        df_tsne_2['topic_num'] = [int(i.split('_')[1]) for i in df_tsne_2['main_topic']]
        df_tsne_2 = df_tsne_2.sort_values(['topic_num'], ascending = True).drop('topic_num', axis=1)

    if tsne_3 is not None:
        # NOTE(review): this replaces the palette chosen above with seaborn's
        # current palette; if there are more topics than colours in it, the
        # colour lookup below raises IndexError - confirm intent.
        colors = [c for c in sns.color_palette()]

        tsne_results_3 = tsne_3.fit_transform(df_topic_coherences.iloc[:, :num_topics])

        df_tsne_3 = pd.DataFrame()
        df_tsne_3['tsne-3d-d1'] = tsne_results_3[:,0]
        df_tsne_3['tsne-3d-d2'] = tsne_results_3[:,1]
        df_tsne_3['tsne-3d-d3'] = tsne_results_3[:,2]
        df_tsne_3['main_topic'] = df_topic_coherences.iloc[:, num_topics]
        df_tsne_3['color'] = [colors[int(t.split('_')[1])] for t in df_tsne_3['main_topic']]

        df_tsne_3['topic_num'] = [int(i.split('_')[1]) for i in df_tsne_3['main_topic']]
        df_tsne_3 = df_tsne_3.sort_values(['topic_num'], ascending = True).drop('topic_num', axis=1)

        if remove_3d_outliers:
            # Remove those rows with values that are more than three standard deviations from the column mean
            for col in ['tsne-3d-d1', 'tsne-3d-d2', 'tsne-3d-d3']:
                df_tsne_3 = df_tsne_3[np.abs(df_tsne_3[col] - df_tsne_3[col].mean()) <= (3 * df_tsne_3[col].std())]

    if tsne_2 is not None and tsne_3 is not None:
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, # pylint: disable=unused-variable
                                       figsize=(20,10))
        # The first axes is hidden; a 3d subplot is added in its place below.
        ax1.axis('off')
    else:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,10))

    if tsne_2 is not None and tsne_3 is not None:
        # Plot tsne_2, with tsne_3 being added later
        ax1 = sns.scatterplot(data=df_tsne_2, x="tsne-2d-d1", y="tsne-2d-d2",
                              hue=df_topic_coherences.iloc[:, num_topics], alpha=0.3)

        ax1.set_facecolor(light_grey_tup)
        ax1.axes.set_title('t-SNE 2-Dimensional Representation', fontsize=25)
        ax1.set_xlabel('tsne-d1', fontsize=20)
        ax1.set_ylabel('tsne-d2', fontsize=20)

        # Reorder legend entries by the text after the underscore in each label.
        handles, labels = ax1.get_legend_handles_labels()
        legend_order = list(np.argsort([i.split('_')[1] for i in labels]))
        ax1.legend([handles[i] for i in legend_order], [labels[i] for i in legend_order],
                   facecolor=light_grey_tup)

    elif tsne_2 is not None:
        # Plot just tsne_2
        ax = sns.scatterplot(data=df_tsne_2, x="tsne-2d-d1", y="tsne-2d-d2",
                             hue=df_topic_coherences.iloc[:, num_topics], alpha=0.3)

        ax.set_facecolor(light_grey_tup)
        ax.axes.set_title('t-SNE 2-Dimensional Representation', fontsize=25)
        ax.set_xlabel('tsne-d1', fontsize=20)
        ax.set_ylabel('tsne-d2', fontsize=20)

        handles, labels = ax.get_legend_handles_labels()
        legend_order = list(np.argsort([i.split('_')[1] for i in labels]))
        ax.legend([handles[i] for i in legend_order], [labels[i] for i in legend_order],
                  facecolor=light_grey_tup)

    if tsne_2 is not None and tsne_3 is not None:
        # tsne_2 has been plotted, so add tsne_3
        ax2 = fig.add_subplot(121, projection='3d')
        ax2.scatter(xs=df_tsne_3['tsne-3d-d1'],
                    ys=df_tsne_3['tsne-3d-d2'],
                    zs=df_tsne_3['tsne-3d-d3'],
                    c=df_tsne_3['color'],
                    alpha=0.3)

        ax2.set_facecolor('white')
        ax2.axes.set_title('t-SNE 3-Dimensional Representation', fontsize=25)
        ax2.set_xlabel('tsne-d1', fontsize=20)
        ax2.set_ylabel('tsne-d2', fontsize=20)
        ax2.set_zlabel('tsne-d3', fontsize=20)

        with plt.rc_context({"lines.markeredgewidth" : 0}):
            # Add handles via blank lines and order their colors to match tsne_2
            proxy_handles = [Line2D([0], [0], linestyle="none", marker='o', markersize=8,
                                    markerfacecolor=colors[i]) for i in legend_order]
            ax2.legend(proxy_handles, ['topic_{}'.format(i) for i in range(num_topics)],
                       loc='upper left', facecolor=(light_grey_tup))

    elif tsne_3 is not None:
        # Plot just tsne_3
        ax.axis('off')
        ax.set_facecolor('white')
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(xs=df_tsne_3['tsne-3d-d1'],
                   ys=df_tsne_3['tsne-3d-d2'],
                   zs=df_tsne_3['tsne-3d-d3'],
                   c=df_tsne_3['color'],
                   alpha=0.3)

        ax.set_facecolor('white')
        ax.axes.set_title('t-SNE 3-Dimensional Representation', fontsize=25)
        ax.set_xlabel('tsne-d1', fontsize=20)
        ax.set_ylabel('tsne-d2', fontsize=20)
        ax.set_zlabel('tsne-d3', fontsize=20)

        with plt.rc_context({"lines.markeredgewidth" : 0}):
            # Add handles via blank lines
            proxy_handles = [Line2D([0], [0], linestyle="none", marker='o', markersize=8,
                                    markerfacecolor=colors[i]) for i in range(len(colors))]
            ax.legend(proxy_handles, ['topic_{}'.format(i) for i in range(num_topics)],
                      loc='upper left', facecolor=light_grey_tup)

    if save_png:
        plt.savefig('LDA_tSNE_{}.png'.format(time.strftime("%Y%m%d-%H%M%S")), bbox_inches='tight', dpi=500)

    plt.show()
An example plot for both 2d and 3d (with outliers removed) representations of a 10 topic gensim LDA model on subplots would be:
Yes, in principle it is possible to do 3D visualization of LDA model results. Here is more information about using T-SNE for that.

Python Scipy Curvefit to Linear Quadratic Curve

I'm trying to fit a linear quadratic model curve to experiment data. The Y axis values reduce from 1 to 10^-5. When I use the following code, the resulting curve often seems to not fit the data at higher X values. I have a suspicion that because the Y values at high X values are so small, the resulting difference between the experiment value and model value is small. But I would like the model curve to pass as close to the higher X value points as possible (even if it means the low values are not as well fitted). I haven't found anything about weighting in scipy.optimize.curve_fit, other than using standard deviations (which I don't have). How can I improve my model fit at high X values?
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
def lq(x, a, b):
    """Linear-quadratic model y(x) = exp[-(a*x + b*x²)], evaluated per element.

    ``x`` is iterated item by item; the result is a list with one value for
    each item.
    """
    return [np.exp(-(a * xi + b * xi ** 2)) for xi in x]
# x and y are from experiment
x = [0, 1.778, 2.921, 3.302, 6.317, 9.524, 10.54]
y = [1, 0.831763771, 0.598411595, 0.656145266, 0.207014135, 0.016218101, 0.004102041]

# Fit a and b directly against y, starting from 0.05 for both.
popt, pcov = curve_fit(lq, x, y, p0=[0.05, 0.05])
a, b = popt

# make the model curve using a and b
xmodel = list(range(20))
ymodel = lq(xmodel, a, b)

# Experiment points and model curve on a logarithmic y axis.
fig, ax1 = plt.subplots()
ax1.set_yscale('log')
ax1.plot(x, y, "ro", label="Experiment")
ax1.plot(xmodel, ymodel, "r--", label="Model")
plt.show()
I agree with your assessment that the fit is not very sensitive to small misfits for the small values of y. Since you are plotting the data and fit on a semi-log plot, I think that what you really want is to fit in the log-space as well. That is, you could fit log(y) to a quadratic function. As an aside (but an important one if you're going to be doing numerical work with Python), you should not loop over lists but rather use numpy arrays: this will make everything faster and simpler. With such changes, your script might look like
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
def lq(x, a, b):
    """Return -(a*x + b*x*x), the exponent of the linear-quadratic model.

    Works on whatever supports ``*`` and ``+`` with ``a`` and ``b``
    (scalars or numpy arrays).
    """
    exponent = a * x + b * x * x
    return -exponent
# Experiment data as numpy arrays.
x = np.array([0, 1.778, 2.921, 3.302, 6.317, 9.524, 10.54])
y = np.array([1, 0.831763771, 0.598411595, 0.656145266, 0.207014135, 0.016218101, 0.004102041])

# Fit in log space: lq models log(y), so the small y values count as well.
popt, pcov = curve_fit(lq, x, np.log(y), p0=[0.05, 0.05])
a, b = popt

xmodel = np.arange(20)  # Note: use numpy!
ymodel = np.exp(lq(xmodel, a, b))  # Note: take exp() as inverse log()

fig, ax1 = plt.subplots()
ax1.set_yscale('log')
ax1.plot(x, y, "ro", label="Experiment")
ax1.plot(xmodel, ymodel, "r--", label="Model")
plt.show()
Note that the model function is changed to just be the exponent -(ax + bx^2) that you wanted to write in the first place, and that this is now fitting np.log(y), not y. This will give a much more satisfying fit at the smaller y values.
You might also find lmfit (https://lmfit.github.io/lmfit-py/) helpful for this problem (disclaimer: I am a lead author). With this, your fit script could become
from lmfit import Model
# Wrap lq in an lmfit Model and start both parameters at 0.05.
model = Model(lq)
params = model.make_params(a=0.05, b=0.05)

# Fit log(y) against x, then print the fit report.
result = model.fit(np.log(y), params, x=x)
print(result.fit_report())

# Evaluate the fitted model on a grid and undo the log for plotting.
xmodel = np.arange(20)
ymodel = np.exp(result.eval(x=xmodel))

plt.plot(x, y, "ro", label="Experiment")
plt.plot(xmodel, ymodel, "r--", label="Model")
plt.yscale('log')
plt.legend()
plt.show()
This will print out a report including fit statistics and interpretable uncertainties and correlations between variables:
[[Model]]
Model(lq)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 7
# data points = 7
# variables = 2
chi-square = 0.16149397
reduced chi-square = 0.03229879
Akaike info crit = -22.3843833
Bayesian info crit = -22.4925630
[[Variables]]
a: -0.05212688 +/- 0.04406602 (84.54%) (init = 0.05)
b: 0.05274458 +/- 0.00479056 (9.08%) (init = 0.05)
[[Correlations]] (unreported correlations are < 0.100)
C(a, b) = -0.968
and give a plot of
Note that lmfit Parameters can be fixed or bounded and that lmfit comes with many built-in models.
Finally, if you were to include a constant term in the quadratic model, you would not really need an iterative method but could use polynomial regression, as with numpy.polyfit.
Here is a graphical Python fitter using your data with a Gompertz type of sigmoidal equation. This code uses scipy's Differential Evolution genetic algorithm module to determine initial parameter estimates for scipy's non-linear curve_fit() routine. That scipy module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, requiring bounds within which to search. In this example, I made all of the parameter search bounds from -2.0 to 2.0, and that seems to work in this case. Note that it is much easier to provide ranges for the initial parameter estimates than specific values, and those parameter ranges can be generous.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings
# x and y are from experiment
x = [0, 1.778, 2.921, 3.302, 6.317, 9.524, 10.54]
y = [1, 0.831763771, 0.598411595, 0.656145266, 0.207014135, 0.016218101, 0.004102041]

# float arrays under the names the rest of the script uses
xData, yData = numpy.array(x, dtype=float), numpy.array(y, dtype=float)
def func(x, a, b, c):  # Sigmoidal Gompertz C from zunzun.com
    """Evaluate a * exp(b * exp(c * x))."""
    inner = numpy.exp(c * x)
    return a * numpy.exp(b * inner)
# objective for the genetic algorithm: sum of squared errors
def sumOfSquaredError(parameterTuple):
    """Sum of squared differences between yData and func(xData, *parameterTuple)."""
    warnings.filterwarnings("ignore")  # do not print warnings by genetic algorithm
    residuals = yData - func(xData, *parameterTuple)
    return numpy.sum(residuals ** 2.0)
def generate_Initial_Parameters():
    """Run differential_evolution on sumOfSquaredError and return its best parameter vector."""
    # search bounds for a, b and c (one [low, high] pair per parameter)
    parameterBounds = [[-2.0, 2.0] for _ in range(3)]
    # fixed seed for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x
# differential_evolution supplies the starting estimates; per the SciPy docs its
# default final "polish" step is a bounded local minimisation, not curve_fit()
geneticParameters = generate_Initial_Parameters()

# now call curve_fit without passing bounds from the genetic algorithm,
# just in case the best fit parameters are outside those bounds
fittedParameters, pcov = curve_fit(func, xData, yData, p0=geneticParameters)
print('Fitted parameters:', fittedParameters)
print()

# fit statistics, computed on the experiment points
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError)  # squared errors
MSE = numpy.mean(SE)  # mean squared errors
RMSE = numpy.sqrt(MSE)  # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))

print()
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
print()
##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Plot the raw data and the fitted curve on a logarithmic y axis.

    graphWidth and graphHeight are divided by 100 to give the figure size in
    inches, and the figure is created at dpi=100.
    """
    size_inches = (graphWidth / 100.0, graphHeight / 100.0)
    f = plt.figure(figsize=size_inches, dpi=100)
    axes = f.add_subplot(111)

    # plot with log Y axis scaling
    plt.yscale('log')

    # raw data first, as diamond markers
    axes.plot(xData, yData, 'D')

    # fitted equation evaluated across the range of the data, drawn as a line
    xModel = numpy.linspace(min(xData), max(xData))
    yModel = func(xModel, *fittedParameters)
    axes.plot(xModel, yModel)

    axes.set_xlabel('X Data')  # X axis data label
    axes.set_ylabel('Y Data')  # Y axis data label

    plt.show()
    plt.close('all')  # clean up after using pyplot
# requested figure size, passed straight to the plotting helper
graphWidth, graphHeight = 800, 600
ModelAndScatterPlot(graphWidth, graphHeight)

Keras layer for slicing image data into sliding windows

I have a set of images, all of varying widths, but with fixed height set to 100 pixels and 3 channels of depth. My task is to classify if each vertical line in the image is interesting or not. To do that, I look at the line in context of its 10 predecessor and successor lines. Imagine the algorithm sweeping from left to right of the image, detecting vertical lines containing points of interest.
My first attempt at doing this was to manually cut out these sliding windows using numpy before feeding the data into the Keras model. Like this:
# Pad left and right by repeating the first and last entries 10 times each
s = np.repeat(D[:1], 10, axis=0)
e = np.repeat(D[-1:], 10, axis=0)
# After this D has shape (w + 20, 100, 3)
D = np.concatenate((s, D, e))
# Sliding windows creation trick from SO question:
# row i of idx holds the 21 consecutive indices i .. i + 20
idx = np.arange(21)[None, :] + np.arange(len(D) - 20)[:, None]
# Index with the array built above (there is no `indexer` variable)
windows = D[idx]
Then all windows and all ground truth 0/1 values for all vertical lines in all images would be concatenated into two very long arrays.
I have verified that this works, in principle. I fed each window to a Keras layer looking like this:
Conv2D(20, (5, 5), input_shape = (21, 100, 3), padding = 'valid', ...)
But the windowing causes the memory usage to increase 21 times, so doing it this way becomes impractical. I think my scenario is very common in machine learning, so there must be some standard method in Keras to do this efficiently. E.g. I would like to feed Keras my raw image data (w, 100, 80), tell it what the sliding window sizes are, and let it figure out the rest. I have looked at some sample code but I'm an ML noob so I don't get it.
Unfortunately this isn't an easy problem because it can involve using a variable sized input for your Keras model. While I think it is possible to do this with proper use of placeholders that's certainly no place for a beginner to start. your other option is a data generator. As with many computationally intensive tasks there is often a trade off between compute speed and memory requirements, using a generator is more compute heavy and it will be done entirely on your cpu (no gpu acceleration), but it won't make the memory increase.
The point of a data generator is that it will apply the operation to images one at a time to produce the batch, then train on that batch, then free up the memory - so you only end up keeping one batch worth of data in memory at any time. Unfortunately if you have a time consuming generation then this can seriously affect performance.
The generator will be a python generator (using the 'yield' keyword) and is expected to produce a single batch of data, keras is very good at using arbitrary batch sizes, so you can always make one image yield one batch, especially to start.
Here is the keras page on fit_generator - I warn you, this starts to become a lot of work very quickly, consider buying more memory:
https://keras.io/models/model/#fit_generator
Fine I'll do it for you :P
import numpy as np
import pandas as pd
import keras
from keras.models import Model, model_from_json
from keras.layers import Dense, Concatenate, Multiply,Add, Subtract, Input, Dropout, Lambda, Conv1D, Flatten
from tensorflow.python.client import device_lib
# check for my gpu
# (prints whatever device list device_lib reports)
print(device_lib.list_local_devices())
# make some fake image data
# 1000 random widths between 1 and 99; a width of 0 would produce an empty
# image and therefore an empty batch
data_widths = np.random.randint(1, 100, size=1000)
# producing 1000 random images with dimensions w x 100 x 3
# and a vector of which vertical lines are interesting
# I assume your data looks like this
images = []
interesting = []
for w in data_widths:
    images.append(np.random.random([int(w), 100, 3]))
    interesting.append(np.random.random(int(w)) > 0.5)
# this is a generator
def image_generator(images, interesting, context=10):
    """Yield one (windows, truth) batch per image, cycling over the list forever.

    Parameters
    ----------
    images : sequence of arrays
        Each array is indexed along its first axis (one entry per vertical line).
    interesting : sequence of arrays
        interesting[k] holds one flag per first-axis entry of images[k].
    context : int, default=10
        Number of neighbouring entries taken on each side of a line; every
        window therefore has 2 * context + 1 entries.

    Yields
    ------
    (windows, truth)
        windows has one window per entry of the image; truth is the flag
        vector multiplied by 1 and given a trailing axis of length 1.

    An empty ``images`` sequence ends the generator immediately.
    """
    window = 2 * context + 1
    num = 0
    while num < len(images):
        D = images[num]
        # Pad left and right by repeating the first / last entry
        s = np.repeat(D[:1], context, axis=0)
        e = np.repeat(D[-1:], context, axis=0)
        # D now has 2 * context more entries along its first axis
        D = np.concatenate((s, D, e))
        # Sliding windows: row i of idx holds the indices i .. i + window - 1
        idx = np.arange(window)[None, :] + np.arange(len(D) - 2 * context)[:, None]
        windows = D[idx]
        truth = np.expand_dims(1 * interesting[num], axis=1)
        yield (windows, truth)
        num += 1
        # the generator MUST loop
        if num == len(images):
            num = 0
# basic model - replace with your own
input_layer = Input(shape=(21, 100, 3), name="input_node")
fc = Flatten()(input_layer)
# three fully connected ReLU layers, stacked in order
for units, layer_name in ((100, "fc1"), (50, "fc2"), (10, "fc3")):
    fc = Dense(units, activation='relu', name=layer_name)(fc)
output_layer = Dense(1, activation='sigmoid', name="output")(fc)

model = Model(input_layer, output_layer)
model.compile(optimizer="adam", loss='binary_crossentropy')
model.summary()

# and training: one generator batch per step, one pass over the images per epoch
training_history = model.fit_generator(
    image_generator(images, interesting),
    epochs=5,
    initial_epoch=0,
    steps_per_epoch=len(images),
    verbose=1,
)

scikit learn: how to check coefficients significance

i tried to do a LR with SKLearn for a rather large dataset with ~600 dummy and only few interval variables (and 300 K lines in my dataset) and the resulting confusion matrix looks suspicious. I wanted to check the significance of the returned coefficients and ANOVA but I cannot find how to access it. Is it possible at all? And what is the best strategy for data that contains lots of dummy variables? Thanks a lot!
Scikit-learn deliberately does not support statistical inference. If you want out-of-the-box coefficients significance tests (and much more), you can use Logit estimator from Statsmodels. This package mimics interface glm models in R, so you could find it familiar.
If you still want to stick to scikit-learn LogisticRegression, you can use the asymptotic approximation to the distribution of maximum likelihood estimates. Precisely, for a vector of maximum likelihood estimates theta, its variance-covariance matrix can be estimated as inverse(H), where H is the Hessian matrix of the negative log-likelihood at theta. This is exactly what the function below does:
import numpy as np
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression
def logit_pvalue(model, x):
    """Calculate two-sided p-values for scikit-learn LogisticRegression coefficients.

    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x:     matrix on which the model was fit

    returns:
        1-D array of p-values, intercept first, then one per column of x.

    This function uses asymptotics for maximum likelihood estimates: the
    variance-covariance matrix is taken as the inverse of X^T W X, where X is
    x with a leading column of ones and W holds p0 * p1 for each row.
    Only the first row of model.coef_ and the first two probability columns
    are used (binary case).
    """
    p = np.asarray(model.predict_proba(x))
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    # Leading column of ones pairs with the intercept.
    x_full = np.insert(np.asarray(x, dtype=float), 0, 1, axis=1)
    # Per-row weight p0 * p1; one matrix product replaces the per-row loop.
    weights = p[:, 0] * p[:, 1]
    information = x_full.T @ (x_full * weights[:, np.newaxis])
    vcov = np.linalg.inv(information)
    se = np.sqrt(np.diag(vcov))
    z = coefs / se
    # sf(|z|) equals 1 - cdf(|z|) but keeps precision for very small p-values.
    return 2 * norm.sf(np.abs(z))
# test p-values on a tiny single-feature data set
x = np.arange(10).reshape(-1, 1)
y = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1])
# very large C, as the logit_pvalue docstring asks for
model = LogisticRegression(C=1e30).fit(x, y)
print(logit_pvalue(model, x))

# compare with statsmodels
import statsmodels.api as sm
sm_model = sm.Logit(y, sm.add_constant(x)).fit(disp=0)
print(sm_model.pvalues)
sm_model.summary()
The outputs of print() are identical, and they happen to be coefficient p-values.
[ 0.11413093 0.08779978]
[ 0.11413093 0.08779979]
sm_model.summary() also prints a nicely formatted HTML summary.

Resources