Using `pyhf` to float both signal and background strength - statistics

Hi pyhf users and developers!
I have a question that follows from a previous one, so I'll start with the answer.py code provided in one of the responses and then make a minor modification.
I run the fit with the parameters given in the response, but I then want to see the results of the fit, so I add some code to weight the original templates by the fit results and replot them with the overlays. The full script is below.
import pyhf
import pyhf.contrib.viz.brazil
import numpy as np
import matplotlib.pylab as plt
# - Get the uncertainties on the best fit signal strength
# - Calculate a 95% CL upper limit on the signal strength
tag = "ORIGINAL"
def plot_hist(ax, bins, data, bottom=0, color=None, label=None):
    bin_width = bins[1] - bins[0]
    bin_leftedges = bins[:-1]
    bin_centers = [edge + bin_width / 2.0 for edge in bin_leftedges]
    ax.bar(
        bin_centers, data, bin_width, bottom=bottom, alpha=0.5, color=color, label=label
    )


def plot_data(ax, bins, data, label="Data"):
    bin_width = bins[1] - bins[0]
    bin_leftedges = bins[:-1]
    bin_centers = [edge + bin_width / 2.0 for edge in bin_leftedges]
    ax.scatter(bin_centers, data, color="black", label=label)


def invert_interval(test_mus, hypo_tests, test_size=0.05):
    # This will be taken care of in v0.5.3
    cls_obs = np.array([test[0] for test in hypo_tests]).flatten()
    cls_exp = [
        np.array([test[1][idx] for test in hypo_tests]).flatten() for idx in range(5)
    ]
    crossing_test_stats = {"exp": [], "obs": None}
    for cls_exp_sigma in cls_exp:
        crossing_test_stats["exp"].append(
            np.interp(
                test_size, list(reversed(cls_exp_sigma)), list(reversed(test_mus))
            )
        )
    crossing_test_stats["obs"] = np.interp(
        test_size, list(reversed(cls_obs)), list(reversed(test_mus))
    )
    return crossing_test_stats


def main():
    np.random.seed(0)
    pyhf.set_backend("numpy", "minuit")

    observable_range = [0.0, 10.0]
    bin_width = 0.5
    _bins = np.arange(observable_range[0], observable_range[1] + bin_width, bin_width)

    n_bkg = 2000
    n_signal = int(np.sqrt(n_bkg))

    # Generate simulation
    bkg_simulation = 10 * np.random.random(n_bkg)
    signal_simulation = np.random.normal(5, 1.0, n_signal)

    bkg_sample, _ = np.histogram(bkg_simulation, bins=_bins)
    signal_sample, _ = np.histogram(signal_simulation, bins=_bins)

    # Generate observations
    signal_events = np.random.normal(5, 1.0, int(n_signal * 0.8))
    bkg_events = 10 * np.random.random(int(n_bkg + np.sqrt(n_bkg)))

    observed_events = np.array(signal_events.tolist() + bkg_events.tolist())
    observed_sample, _ = np.histogram(observed_events, bins=_bins)

    # Visualize the simulation and observations
    fig, ax = plt.subplots()
    fig.set_size_inches(7, 5)

    plot_hist(ax, _bins, bkg_sample, label="Background")
    plot_hist(ax, _bins, signal_sample, bottom=bkg_sample, label="Signal")
    plot_data(ax, _bins, observed_sample)
    ax.legend(loc="best")
    ax.set_ylim(top=np.max(observed_sample) * 1.4)
    ax.set_xlabel("Observable")
    ax.set_ylabel("Count")
    fig.savefig("components_{0}.png".format(tag))

    # Build the model
    bkg_uncerts = np.sqrt(bkg_sample)
    model = pyhf.simplemodels.hepdata_like(
        signal_data=signal_sample.tolist(),
        bkg_data=bkg_sample.tolist(),
        bkg_uncerts=bkg_uncerts.tolist(),
    )
    data = pyhf.tensorlib.astensor(observed_sample.tolist() + model.config.auxdata)

    # Perform inference
    fit_result = pyhf.infer.mle.fit(data, model, return_uncertainties=True)
    bestfit_pars, par_uncerts = fit_result.T
    print(
        f"best fit parameters:\
        \n * signal strength: {bestfit_pars[0]} +/- {par_uncerts[0]}\
        \n * nuisance parameters: {bestfit_pars[1:]}\
        \n * nuisance parameter uncertainties: {par_uncerts[1:]}"
    )

    # Visualize the results
    fit_bkg_sample = []
    for w, b in zip(bestfit_pars[1:], bkg_sample):
        fit_bkg_sample.append(w * b)

    fit_signal_sample = bestfit_pars[0] * np.array(signal_sample)

    fig, ax = plt.subplots()
    fig.set_size_inches(7, 5)

    plot_hist(ax, _bins, fit_bkg_sample, label="Background")
    plot_hist(ax, _bins, fit_signal_sample, bottom=fit_bkg_sample, label="Signal")
    plot_data(ax, _bins, observed_sample)
    ax.legend(loc="best")
    ax.set_ylim(top=np.max(observed_sample) * 1.4)
    ax.set_xlabel("Observable")
    ax.set_ylabel("Count")
    fig.savefig("components_after_fit_{0}.png".format(tag))

    # Perform hypothesis test scan
    _start = 0.0
    _stop = 5
    _step = 0.1
    poi_tests = np.arange(_start, _stop + _step, _step)

    print("\nPerforming hypothesis tests\n")
    hypo_tests = [
        pyhf.infer.hypotest(
            mu_test,
            data,
            model,
            return_expected_set=True,
            return_test_statistics=True,
            qtilde=True,
        )
        for mu_test in poi_tests
    ]

    # Upper limits on signal strength
    results = invert_interval(poi_tests, hypo_tests)

    print(f"Observed Limit on µ: {results['obs']:.2f}")
    print("-----")
    for idx, n_sigma in enumerate(np.arange(-2, 3)):
        print(
            "Expected {}Limit on µ: {:.3f}".format(
                " " if n_sigma == 0 else "({} σ) ".format(n_sigma),
                results["exp"][idx],
            )
        )

    # Visualize the "Brazil band"
    fig, ax = plt.subplots()
    fig.set_size_inches(7, 5)

    ax.set_title("Hypothesis Tests")
    ax.set_ylabel(r"$\mathrm{CL}_{s}$")
    ax.set_xlabel(r"$\mu$")

    pyhf.contrib.viz.brazil.plot_results(ax, poi_tests, hypo_tests)
    fig.savefig("brazil_band_{0}.png".format(tag))


if __name__ == "__main__":
    main()
When I run it, I get the following plots. The first shows the original observations/simulation and the second shows the simulation scaled by the fit results.
So this all looks good and I think I understand what is happening.
But now I ask a question, which is better illustrated with a slight variation on your example.
I'm going to modify the generated simulations and observations so that the number of background events differs between the simulation and the observed sample. I'm also making the signal more significant. This would be an example where I've not been able to get a good estimate of the background contribution(s) prior to doing the fit. In the example you provide, the number of background events is the same for the simulated sample and the data, which is not going to be the case in the real world.
So I go to the above code and I change these lines.
n_bkg = 2000
n_signal = 200
# Generate simulation
bkg_simulation = 10 * np.random.random(n_bkg)
signal_simulation = np.random.normal(5, 1.0, n_signal)
bkg_sample, _ = np.histogram(bkg_simulation, bins=_bins)
signal_sample, _ = np.histogram(signal_simulation, bins=_bins)
# Generate observations
signal_events = np.random.normal(5, 1.0, int(n_signal * 0.8))
bkg_events = 10 * np.random.random(n_bkg - 300)
The fit isn't great, and I wouldn't expect it to be since I locked down the number of background events, modulo the Poisson fluctuations in each bin. The relevant plots (before/after fit) are shown here.
I might have thought another way to approach this would be to add another non-nuisance, floating parameter that represents the background strength, while still letting the individual bins vary within Poisson fluctuations. For that matter, couldn't (shouldn't?) the signal bins fluctuate as well?
In that case, I would then start with a vastly larger number of data points in my simulated samples to get the more "true" (I know that's not rigorous) distribution. Once the fit drives the number of signal/background events down, Poisson fluctuations would become more significant.
I'm sure the optimization/minimization of the likelihood function becomes much more difficult but it also feels like we're constraining the fit too early if we lock down the bulk background normalization. Or maybe I'm missing something?
Thanks as always for your help and response!

You should be able to add a new nuisance parameter by adding a "normfactor" modifier to the background component, e.g.
spec = {'channels': [{'name': 'singlechannel',
                      'samples': [{'name': 'signal',
                                   'data': [10.0],
                                   'modifiers': [{'name': 'mu', 'type': 'normfactor', 'data': None}]},
                                  {'name': 'background',
                                   'data': [50.0],
                                   'modifiers': [{'name': 'bkgnorm',
                                                  'type': 'normfactor',
                                                  'data': None}]}]}]}
Note the symmetry between signal and background.
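For completeness, here is a minimal sketch (my addition, not part of the original answer) of turning that spec into a model and fitting both normalization factors at once; the observed count of 55.0 is just an illustrative number:

import pyhf

model = pyhf.Model(spec, poi_name="mu")
# one observed bin plus auxiliary data (empty here, since neither normfactor is constrained)
data = [55.0] + model.config.auxdata
bestfit_pars = pyhf.infer.mle.fit(data, model)
print(dict(zip(model.config.par_order, bestfit_pars.tolist())))

If you also want the per-bin background fluctuations from the original hepdata_like model, my understanding is that a shapesys (or staterror) modifier can be listed alongside the normfactor in the background sample's modifiers list, so both effects are applied.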

Related

Trying to rule out astrology but something is wrong

I am trying to rule out a possible astrology effect on populations as a statistically insignificant effect, but to no avail. I am using Pearson's chi-square test on two distributions of sun signs from two different populations: one of astronaut pilots and the other of celebrities. Something must be wrong, but I have failed to find it, probably on the statistics side.
import numpy as np
import pandas as pd
import ephem
from collections import Counter, namedtuple
import matplotlib.pyplot as plt
from scipy import stats

models = pd.read_csv('models.csv', delimiter=',')
astronauts = pd.read_csv('astronauts.csv', delimiter=',')
models = models.sample(229)
astronauts = astronauts.sample(229)
sun = ephem.Sun()

def get_planet_constellation(planet, dataset):
    person_planet_constellation = []
    for person in dataset['Birth Date']:
        planet.compute(person)
        person_planet_constellation += [ephem.constellation(planet)[1]]
    return person_planet_constellation

def plot_bar_group(planet, data1, data2):
    fig, ax = plt.subplots()
    plt.bar(data1.keys(), data1.values(), alpha=0.5)
    plt.bar(data2.keys(), data2.values(), alpha=0.5)
    plt.legend(['astronauts', 'models'])
    ylabel = 'Percentages of ' + planet.name + ' in constellation'
    ax.set_ylabel(ylabel)
    title = 'Histogram of ' + planet.name + ' in constellation by group'
    ax.set_title(title)
    plt.show()

astronaut_sun_constellation = Counter(
    get_planet_constellation(sun, astronauts))
model_sun_constellation = Counter(get_planet_constellation(sun, models))
plot_bar_group(sun, astronaut_sun_constellation, model_sun_constellation)

a = list(astronaut_sun_constellation.values())
b = list(model_sun_constellation.values())
s = np.array([a, b])
stat, p, dof, expected = stats.chi2_contingency(s)
print(stat, p, dof, expected)

prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

# interpret p-value
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')
https://www.dropbox.com/s/w7rye6m5lbihjlh/astronauts.csv
https://www.dropbox.com/s/xlxanr0pxqtxcvv/models.csv
I eventually found the bug: it was in passing the counters as lists to the chi-square function. They must be sorted (i.e. aligned on the same category order) first, otherwise the chi-square test sees a major difference in the counters' values. All astrology effects are now insignificant, as expected, at the 0.95 level.
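For example, a small sketch of that fix, aligning both counters on one fixed ordering of categories before building the contingency table (reusing the two Counter objects from the script above):

import numpy as np
from scipy import stats

# one fixed ordering of all sun signs seen in either group; missing signs count as 0
categories = sorted(set(astronaut_sun_constellation) | set(model_sun_constellation))
a = [astronaut_sun_constellation.get(c, 0) for c in categories]
b = [model_sun_constellation.get(c, 0) for c in categories]

stat, p, dof, expected = stats.chi2_contingency(np.array([a, b]))
print(p)  # now each column of the table really compares the same sign in both groups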

Sort similarity matrix according to plot colors

I have this similarity matrix plot of some documents. I want to sort the values of the matrix, which is a numpy ndarray, to group colors while maintaining their relative positions (the diagonal yellow line) and the labels as well.
import os
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

path = "C:\\Users\\user\\Desktop\\texts\\dataset"
text_files = os.listdir(path)
# print(text_files)

tfidf_vectorizer = TfidfVectorizer()
documents = [open(os.path.join(path, f), encoding="utf-8").read()
             for f in text_files if f.endswith('.txt')]
sparse_matrix = tfidf_vectorizer.fit_transform(documents)

labels = []
for f in text_files:
    if f.endswith('.txt'):
        labels.append(f)

pairwise_similarity = sparse_matrix * sparse_matrix.T
pairwise_similarity_array = pairwise_similarity.toarray()

fig, ax = plt.subplots(figsize=(20, 20))
cax = ax.matshow(pairwise_similarity_array, interpolation='spline16')
ax.grid(True)
plt.title('News articles similarity matrix')
plt.xticks(range(23), labels, rotation=90)
plt.yticks(range(23), labels)
fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
plt.show()
Here is one possibility.
The idea is to use the information in the similarity matrix and put elements next to each other if they are similar. If two items are similar, they should also be similar with respect to other elements, i.e. have similar colors.
I start with the element which has the most in common with all other elements (this choice is a bit arbitrary) [a], and as the next element I choose, from the remaining elements, the one which is closest to the current one [b].
import numpy as np
import matplotlib.pyplot as plt

def create_dummy_sim_mat(n):
    sm = np.random.random((n, n))
    sm = (sm + sm.T) / 2
    sm[range(n), range(n)] = 1
    return sm

def argsort_sim_mat(sm):
    idx = [np.argmax(np.sum(sm, axis=1))]  # a
    for i in range(1, len(sm)):
        sm_i = sm[idx[-1]].copy()
        sm_i[idx] = -1
        idx.append(np.argmax(sm_i))  # b
    return np.array(idx)

n = 10
sim_mat = create_dummy_sim_mat(n=n)
idx = argsort_sim_mat(sim_mat)
sim_mat2 = sim_mat[idx, :][:, idx]  # apply reordering for rows and columns

# Plot results
fig, ax = plt.subplots(1, 2)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat2)

def ticks(_ax, ti, la):
    _ax.set_xticks(ti)
    _ax.set_yticks(ti)
    _ax.set_xticklabels(la)
    _ax.set_yticklabels(la)

ticks(_ax=ax[0], ti=range(n), la=range(n))
ticks(_ax=ax[1], ti=range(n), la=idx)
After meTchaikovsky's answer I also tested my idea on a clustered similarity matrix (see first image). This method works, but it is not perfect (see second image).
Because I use the similarity between two elements as an approximation of their similarity to all other elements, it is quite clear why this does not work perfectly.
So instead of using the initial similarity to sort the elements, one could calculate a second-order similarity matrix which measures how similar the similarities are (sorry).
This measure describes better what you are interested in: if two rows / columns have similar colors, they should be close to each other. The algorithm to sort the matrix is the same as before:
def add_cluster(sm, c=3):
    idx_cluster = np.array_split(np.random.permutation(np.arange(len(sm))), c)
    for ic in idx_cluster:
        cluster_noise = np.random.uniform(0.9, 1.0, (len(ic),)*2)
        sm[ic[np.newaxis, :], ic[:, np.newaxis]] = cluster_noise

def get_sim_mat2(sm):
    return 1 / (np.linalg.norm(sm[:, np.newaxis] - sm[np.newaxis], axis=-1) + 1/n)

sim_mat = create_dummy_sim_mat(n=100)
add_cluster(sim_mat, c=4)
sim_mat2 = get_sim_mat2(sim_mat)

idx = argsort_sim_mat(sim_mat)
idx2 = argsort_sim_mat(sim_mat2)
sim_mat_sorted = sim_mat[idx, :][:, idx]
sim_mat_sorted2 = sim_mat[idx2, :][:, idx2]

# Plot results
fig, ax = plt.subplots(1, 3)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(sim_mat_sorted2)
The results with this second method are quite good (see third image), but I guess there exist cases where this approach also fails, so I would be happy about feedback.
Edit
I tried to explain it and also linked the ideas to the code with [a] and [b], but obviously I did not do a good job, so here is a second, more verbose explanation.
You have n elements and an n x n similarity matrix sm where each cell (i, j) describes how similar element i is to element j. The goal is to order the rows / columns in such a way that one can see existing patterns in the similarity matrix. My idea to achieve this is really simple.
You start with an empty list and add elements one by one. The criterion for the next element is its similarity to the current element. If element i was added in the last step, I choose the element argmax(sm[i, :]) as the next one, ignoring the elements already added to the list. I ignore those elements by setting their values to -1.
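As a tiny worked example (my addition), running argsort_sim_mat from above on a hand-written 4x4 similarity matrix shows how the greedy ordering groups the similar pairs:

import numpy as np

sm_small = np.array([[1.0, 0.9, 0.1, 0.3],
                     [0.9, 1.0, 0.2, 0.4],
                     [0.1, 0.2, 1.0, 0.8],
                     [0.3, 0.4, 0.8, 1.0]])
# start at the row with the largest sum (index 1), then always jump to the most
# similar element not yet used: 1 -> 0 -> 3 -> 2
print(argsort_sim_mat(sm_small))  # [1 0 3 2]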
You can use the function ticks to reorder the labels:
labels = np.array(labels)  # make labels a numpy array, to index it with a list
ticks(_ax=ax[0], ti=range(n), la=labels[idx])
#scleronomic's solution is very elegant, but it also has one shortcoming: we cannot set the number of clusters in the sorted correlation matrix. Assume we are working with a set of variables in which some of them are weakly correlated:
import string
import numpy as np
import pandas as pd

n_variables = 20
n_clusters = 10
n_samples = 100

np.random.seed(100)
names = list(string.ascii_lowercase)[:n_variables]
belongs_to_cluster = np.random.randint(0, n_clusters, n_variables)
latent = np.random.randn(n_clusters, n_samples)

variables = np.random.rand(n_variables, n_samples)
for ind in range(n_clusters):
    mask = belongs_to_cluster == ind
    # weakening the correlation
    if ind % 2 == 0:
        variables[mask] += latent[ind] * 0.1
    variables[mask] += latent[ind]

df = pd.DataFrame({key: val for key, val in zip(names, variables)})
corr_mat = np.array(df.corr())
As you can see, there are 10 clusters of variables by construction; however, variables within the clusters that have an even index are weakly correlated. If we only want to see roughly 5 clusters in the sorted correlation matrix, maybe we need to find another way.
Based on this post, which is the accepted answer to the question "Clustering a correlation matrix", to sort a correlation matrix into blocks we need to find blocks where correlations within blocks are high and correlations between blocks are low. However, the solution provided by that accepted answer works best when we know how many blocks there are in the first place and, more importantly, when the sizes of the underlying blocks are the same, or at least similar. Therefore, I improved the solution with a new function, sort_corr_mat:
def sort_corr_mat(corr_mat, clusters_guess):
    def _swap_rows(corr_mat, var1, var2):
        rs = corr_mat.copy()
        rs[var2, :], rs[var1, :] = corr_mat[var1, :], corr_mat[var2, :]
        cs = rs.copy()
        cs[:, var2], cs[:, var1] = rs[:, var1], rs[:, var2]
        return cs

    # analysis
    max_iter = 500
    best_score, current_score, best_count = -1e8, -1e8, 0
    num_minimua_to_visit = 20
    best_corr = corr_mat
    best_ordering = np.arange(n_variables)
    for i in range(max_iter):
        for row1 in range(n_variables):
            for row2 in range(n_variables):
                if row1 == row2:
                    continue
                option_ordering = best_ordering.copy()
                option_ordering[row1], option_ordering[row2] = best_ordering[row2], best_ordering[row1]
                option_corr = _swap_rows(best_corr, row1, row2)
                option_score = score(option_corr, n_variables, clusters_guess)  # score() comes from the linked post
                if option_score > best_score:
                    best_corr = option_corr
                    best_ordering = option_ordering
                    best_score = option_score
        if best_score > current_score:
            best_count += 1
            current_corr = best_corr
            current_ordering = best_ordering
            current_score = best_score
        if best_count >= num_minimua_to_visit:
            return best_corr  # ,best_ordering
    return best_corr  # ,best_ordering
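Note that the score function called inside sort_corr_mat is defined in the linked post rather than here. As a self-contained stand-in (my assumption, not the original scoring), one could reward high average correlation inside clusters_guess equal-sized diagonal blocks:

import numpy as np

def score(corr_mat, n_variables, clusters_guess):
    # stand-in block score: mean absolute correlation inside equal-sized diagonal
    # blocks; higher is better. The score() in the linked post may differ in detail.
    block_idx = np.array_split(np.arange(n_variables), clusters_guess)
    return np.mean([np.abs(corr_mat[np.ix_(ix, ix)]).mean() for ix in block_idx])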
With this function and the corr_mat constructed above, I compared the result obtained with my function (on the right) with that obtained with #scleronomic's solution (in the middle):
sim_mat_sorted = corr_mat[argsort_sim_mat(corr_mat), :][:, argsort_sim_mat(corr_mat)]
corr_mat_sorted = sort_corr_mat(corr_mat,clusters_guess=5)
# Plot results
fig, ax = plt.subplots(1,3,figsize=(18,6))
ax[0].imshow(corr_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(corr_mat_sorted)
Clearly, #scleronomic's solution works much better and faster, but my solution offers more control over the pattern of the output.

Adding minor tick marks to a histogram

I am working through this:
https://medium.com/diogo-menezes-borges/introduction-to-statistics-for-data-science-6c246ed2468d
About 3/4 of the way through there is a histogram, but the author does not supply the code used to generate it.
So I decided to give it a go...
I have everything working, but I would like to add minor ticks to my plot.
X-axis only, spaced 200 units apart (matching the bin width used in my code).
In particular, I would like to add minor ticks in the style from the last example from here:
https://matplotlib.org/3.1.0/gallery/ticks_and_spines/major_minor_demo.html
I have tried several times but I just can't get that exact 'style' to work on my plot.
Here is my working code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

print('NumPy: {}'.format(np.__version__))
print('Pandas: {}'.format(pd.__version__))
print('\033[1;31m' + '--------------' + '\033[0m')  # Bold red

display_settings = {
    'max_columns': 15,
    'max_colwidth': 60,
    'expand_frame_repr': False,  # Wrap to multiple pages
    'max_rows': 50,
    'precision': 6,
    'show_dimensions': False
}
# pd.options.display.float_format = '{:,.2f}'.format
for op, value in display_settings.items():
    pd.set_option("display.{}".format(op), value)

file = "e:\\python\\pandas\\medium\\sets.csv"
lego = pd.read_csv(file, encoding="utf-8")
print(lego.shape, '\n')
print(lego.info(), '\n')
print(lego.head(), '\n')
print(lego.isnull().sum(), '\n')

dfs = [lego]
names = ['lego']

def NaN_percent(_df, column_name):
    # empty_values = row_count - _df[column_name].count()
    empty_values = _df[column_name].isnull().sum()
    return (100.0 * empty_values) / row_count

c = 0
print('Columns with missing values expressed as a percentage.')
for df in dfs:
    print('\033[1;31m' + ' ' + names[c] + '\033[0m')
    row_count = df.shape[0]
    for i in list(df):
        x = NaN_percent(df, i)
        if x > 0:
            print(' ' + i + ': ' + str(x.round(4)) + '%')
    c += 1
print()

# What is the average number of parts in the sets of legos?
print(lego['num_parts'].mean(), '\n')
# What is the median number of parts in the sets of legos?
print(lego['num_parts'].median(), '\n')
print(lego['num_parts'].max(), '\n')

# Create Bins for Data Ranges
bins = []
for i in range(lego['num_parts'].min(), 6000, 200):
    bins.append(i + 1)

# Use 'right' to determine which bin overlapping values fall into.
cuts = pd.cut(lego['num_parts'], bins=bins, right=False)
# Count values in each bin.
print(cuts.value_counts(), '\n')

plt.hist(lego['num_parts'], color='red', edgecolor='black', bins=bins)
plt.title('Histogram of Number of parts')
plt.xlabel('Bin')
plt.ylabel('Number of values per bin')
plt.axvline(x=162.2624, color='blue')
plt.axvline(x=45.0, color='green', linestyle='--')

# https://matplotlib.org/gallery/text_labels_and_annotations/custom_legends.html
legend_elements = [Line2D([0], [0], color='blue', linewidth=2, linestyle='-'),
                   Line2D([0], [1], color='green', linewidth=2, linestyle='--')]
labels = ['mean: 162.2624', 'median: 45.0']
plt.legend(legend_elements, labels)

plt.show()
You can just add:
from matplotlib.ticker import AutoMinorLocator

ax = plt.gca()
ax.xaxis.set_minor_locator(AutoMinorLocator())
ax.tick_params(which='minor', length=4, color='r')
See this post to get a better idea of the difference between plt, ax and fig. In broad terms, plt refers to the pyplot library of matplotlib. fig is one "plot" that can consist of one or more subplots. ax refers to one subplot and the x- and y-axis defined for it, including the measuring units, tick marks, tick labels, etc. Many functions in matplotlib are often called as plt.hist, but in the underlying code they are drawing on the "current axes". These axes can be obtained via plt.gca(), or "get current axes". It is not always clear which functions can be called via plt and which only exist via ax. Also, sometimes they get slightly different names. You'll need to look in the documentation or search StackOverflow to find which form is needed in each specific case.
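Since the question asks for minor ticks spaced exactly 200 units apart (the bin width), a fixed locator is a possible alternative to AutoMinorLocator; a small sketch:

from matplotlib.ticker import MultipleLocator

ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(200))  # one minor tick every 200 x-units
ax.tick_params(which='minor', length=4, color='r')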

Recover elements from each cluster generated by scipy dendrogram

I'm building a dendrogram and truncating it to show only the largest 6 clusters. Also, the labeling is done via a simple leaf label function:
def llf(id):
    return str(id)

tree = sch.dendrogram(Z, truncate_mode='lastp',
                      leaf_label_func=llf, p=6, show_contracted=False,
                      show_leaf_counts=False, leaf_rotation=90,
                      no_labels=False, orientation='right')
My output looks like this:
My goal is to replace the non-descriptive labels for the leaves with the minimum value of the members within that cluster. For example, if the top leaf is the cluster that contains the range from 10 to 1000, then I would like to replace '2468' with 10. The actual logic to replace the ticks in the plot is easy to implement:
fig, ax = plt.subplots()
mislabels = ["foo" for i in range(7)]
ax.set_xticklabels(mislabels, fontsize=10, rotation=45)
Any ideas regarding how to extract the values from within the leaders?
So far I'm able to map each singleton leaf to its cluster using fcluster. However, that only maps my initial 1230 points to a cluster. I need to map the point labeled as '2468' to its cluster and I'm not sure how to do that.
Thanks!
I found the way to do it:
fig, ax = plt.subplots(2, 2, figsize=(10, 5))
ax = ax.ravel()

# [idx_plot[k]:, idx_plot[k]:]
for k, val in enumerate(linkages['ward']):
    cluster_local = cluster_labels[val]['ward'][6]
    leaders = sch.leaders(linkages['ward'][val], cluster_local)
    dates_labels = dict()
    for v, i in enumerate(leaders[1]):
        date_idx = np.where(cluster_local == i)
        dates_labels[leaders[0][v]] = (fechas[val][idx_plot[val]:][date_idx[0][0]].strftime('%y/%m'),
                                       fechas[val][idx_plot[val]:][date_idx[0][-1]].strftime('%y/%m'))
    mislabels = [dates_labels[leaders[0][i]][0] + ', ' + dates_labels[leaders[0][i]][1] for i in range(6)]
    yuca = sch.dendrogram(linkages['ward'][val], truncate_mode='lastp', ax=ax[k], leaf_label_func=llf, p=6,
                          show_contracted=False, show_leaf_counts=False,
                          leaf_rotation=0, no_labels=False, orientation='right')
    # ax[k].set_xticklabels(mislabels, fontsize=10, rotation=90)
    ax[k].set_yticklabels(mislabels, fontsize=10, rotation=0)
    ax[k].set_title(val)

plt.tight_layout()
plt.show()
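For reference, a more generic sketch of the same idea (names like Z and values are placeholders for your linkage matrix and original data): fcluster cuts the tree into the same six clusters shown by the truncated dendrogram, and leaders maps each truncated leaf node back to one of those clusters so you can label it with, e.g., the minimum of its members.

import numpy as np
from scipy.cluster import hierarchy as sch

flat_clusters = sch.fcluster(Z, t=6, criterion='maxclust')     # cluster id for each original point
leader_nodes, leader_clusters = sch.leaders(Z, flat_clusters)  # truncated-leaf node id -> cluster id

leaf_labels = {}
for node, clust in zip(leader_nodes, leader_clusters):
    members = np.where(flat_clusters == clust)[0]  # original observations under this leaf
    leaf_labels[node] = values[members].min()      # label the leaf with the minimum member value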

SparsePCA in sklearn not working properly?

First let me clarify that here "sparse PCA" means PCA with L1 penalty and sparse loadings, not PCA on sparse matrix.
I've read the paper on sparse PCA by Zou and Hastie, I've read the documentation on sklearn.decomposition.SparsePCA, and I know how to use PCA, but I can't seem to get the right result from SparsePCA.
Namely, when the L1 penalty is 0, the result from SparsePCA is supposed to agree with PCA, but the loadings differ quite a lot. To make sure that I didn't mess up any hyperparameters, I used the same hyperparameters (convergence tolerance, maximum iterations, ridge penalty, lasso penalty...) in R with 'spca' from 'elasticnet', and R gave me the correct result. I'd rather not have to go through the source code of SparsePCA, so if anyone has experience using this function, could you let me know whether I made any mistakes?
Below is how I generated my dataset. It's a bit convoluted because I wanted a specific Markov Decision Process to test some reinforcement learning algorithms. Just treat it as some non-sparse dataset.
import numpy as np
from sklearn.decomposition import PCA, SparsePCA
import numpy.random as nr

def transform(data, TranType=None):
    if TranType == 'quad':
        data = np.minimum(np.square(data), 3)
    if TranType == 'cubic':
        data = np.maximum(np.minimum(np.power(data, 3), 3), -3)
    if TranType == 'exp':
        data = np.minimum(np.exp(data), 3)
    if TranType == 'abslog':
        data = np.minimum(np.log(abs(data)), 3)
    return data

def NewStateGen(OldS, A, TranType, m=0, sd=0.5, nsd=0.1, dim=64):
    # dim needs to be a multiple of 4, and preferably a multiple of 16.
    assert (dim == len(OldS) and dim % 4 == 0)
    TrueDim = dim / 4
    NewS = np.zeros(dim)
    # Generate new state according to action
    if A == 0:
        NewS[range(0, dim, 4)] = transform(OldS[0:TrueDim], TranType) + \
            nr.normal(scale=nsd, size=TrueDim)
        NewS[range(1, dim, 4)] = transform(OldS[0:TrueDim], TranType) + \
            nr.normal(scale=nsd, size=TrueDim)
        NewS[range(2, dim, 4)] = nr.normal(m, sd, size=TrueDim)
        NewS[range(3, dim, 4)] = nr.normal(m, sd, size=TrueDim)
        R = 2 * np.sum(transform(OldS[0:int(np.ceil(dim / 32.0))], TranType)) - \
            np.sum(transform(OldS[int(np.ceil(dim / 32.0)):(dim / 16)], TranType)) + \
            nr.normal(scale=nsd)
    if A == 1:
        NewS[range(0, dim, 4)] = nr.normal(m, sd, size=TrueDim)
        NewS[range(1, dim, 4)] = nr.normal(m, sd, size=TrueDim)
        NewS[range(2, dim, 4)] = transform(OldS[0:TrueDim], TranType) + \
            nr.normal(scale=nsd, size=TrueDim)
        NewS[range(3, dim, 4)] = transform(OldS[0:TrueDim], TranType) + \
            nr.normal(scale=nsd, size=TrueDim)
        R = 2 * np.sum(transform(OldS[int(np.floor(dim / 32.0)):(dim / 16)], TranType)) - \
            np.sum(transform(OldS[0:int(np.floor(dim / 32.0))], TranType)) + \
            nr.normal(scale=nsd)
    return NewS, R

def MDPGen(dim=64, rep=1, n=30, T=100, m=0, sd=0.5, nsd=0.1, TranType=None):
    X_all = np.zeros(shape=(rep*n*T, dim))
    Y_all = np.zeros(shape=(rep*n*T, dim+1))
    A_all = np.zeros(rep*n*T)
    R_all = np.zeros(rep*n*T)
    for j in xrange(rep*n):
        # Data for a single subject
        X = np.zeros(shape=(T+1, dim))
        A = np.zeros(T)
        R = np.zeros(T)
        NewS = np.zeros(dim)
        X[0] = nr.normal(m, sd, size=dim)
        for i in xrange(T):
            OldS = X[i]
            # Pick a random action
            A[i] = nr.randint(2)
            # Generate new state according to action
            X[i+1], R[i] = NewStateGen(OldS, A[i], TranType, m, sd, nsd, dim)
        Y = np.concatenate((X[1:(T+1)], R.reshape(T, 1)), axis=1)
        X = X[0:T]
        X_all[(j*T):((j+1)*T)] = X
        Y_all[(j*T):((j+1)*T)] = Y
        A_all[(j*T):((j+1)*T)] = A
        R_all[(j*T):((j+1)*T)] = R
    return {'X': X_all, 'Y': Y_all, 'A': A_all, 'R': R_all, 'rep': rep, 'n': n, 'T': T}

nr.seed(1)
MDP = MDPGen(dim=64, rep=1, n=30, T=90, sd=0.5, nsd=0.1, TranType=None)
X = MDP.get('X').astype(np.float32)
Now I run PCA and SparsePCA. When the lasso penalty, 'alpha', is 0, SparsePCA is supposed to give the same result as PCA, which is not the case. The other hyperparameters are set with the default values from elasticnet in R. If I use the default from SparsePCA the result will still be incorrect.
PCA_model = PCA(n_components=64)
PCA_model.fit(X)
Z = PCA_model.transform(X)
SPCA_model = SparsePCA(n_components=64, alpha=0, ridge_alpha=1e-6, max_iter=200, tol=1e-3)
SPCA_model.fit(X)
SZ = SPCA_model.transform(X)
# Check the first 2 loadings from PCA and SPCA. They are supposed to agree.
print PCA_model.components_[0:2]
print SPCA_model.components_[0:2]
# Check the first 2 observations of transformed data. They are supposed to agree.
print Z[0:2]
print SZ[0:2]
When the lasso penalty is greater than 0, the result from SparsePCA is still quite different from what R gives me, and the latter is correct based on manual inspection and what I learned from the original paper. So, is SparsePCA broken, or did I miss anything?
As often: there are many different formulations & implementations.
sklearn is using a different implementation with different characteristics.
Let's have a look at how they differ:
sklearn's objective: (see the formulation in the user guide)
Elasticnet's objective: (see the Zou et al. paper)
So it seems sklearn is at least doing something different with regard to the l2-norm-based component (it's missing).
This is by design, as it is the basic form within the area of dictionary learning (the algorithm paper linked by sklearn and used for the implementation).
It is quite possible that this alternative formulation does not guarantee (or does not care at all about) emulating classic PCA when the sparsity parameter is zero, which is not really surprising, as these problems differ a lot in terms of optimization theory and sparse PCA has to resort to some heuristic-based algorithm since the problem itself is NP-hard (ref). This idea is strengthened by the description of the equivalence theorem here:
The answers aren't different. At first I thought it might be the solvers, but after checking different solvers I get almost identical loadings. See this:
nr.seed(1)
MDP = MDPGen(dim=16, rep=1, n=30, T=90, sd=0.5, nsd=0.1, TranType=None)
X = MDP.get('X').astype(np.float32)
PCA_model = PCA(n_components=10,svd_solver='auto',tol=1e-6)
PCA_model.fit(X)
SPCA_model = SparsePCA(n_components=10, alpha=0, ridge_alpha=0)
SPCA_model.fit(X)
PC1 = PCA_model.components_[0]/np.linalg.norm(PCA_model.components_[0])
SPC1 = SPCA_model.components_[0].T/np.linalg.norm(SPCA_model.components_[0])
print(np.dot(PC1,SPC1))
import pylab
pylab.plot(PC1)
pylab.plot(SPC1)
pylab.show()
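To push that check beyond the first component, one could compare all loadings at once up to sign and scale; a small sketch reusing the fitted PCA_model and SPCA_model from above:

import numpy as np

P = PCA_model.components_
S = SPCA_model.components_
# Normalize each loading vector, then look at the absolute cosine similarity between
# matching components; values near 1 mean the loadings agree up to sign and scale.
Pn = P / np.linalg.norm(P, axis=1, keepdims=True)
Sn = S / np.linalg.norm(S, axis=1, keepdims=True)
print(np.round(np.abs(Pn @ Sn.T).diagonal(), 3))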
