Recover elements from each cluster generated by scipy dendrogram - python-3.x

I'm building a dendrogram and truncating it to show only the largest 6 clusters. Also, the labeling is done via a simple leaf label function:
def llf(id):
return str(id)
tree = sch.dendrogram(Z, truncate_mode='lastp',
leaf_label_func=llf, p=6, show_contracted=False,
show_leaf_counts=False, leaf_rotation=90,
no_labels = False, orientation='right')
My output looks like this:
My goal is to replace the non descriptive labels for the leaves with the minimum value of the members from within that cluster. For example, if the top leaf is the cluster that contains the range from 10 to 1000, then I would like to replace '2468' with 10. The actual logic to replace the ticks in the plot is easy to implement:
fig, ax = plt.subplots()
mislabels = ["foo" for i in range(7)]
ax.set_xticklabels(mislabels, fontsize=10, rotation=45)
Any ideas regarding how to extract the values from within the leaders?
So far I'm able to map each singleton leaf to its cluster using fcluster. However, that only maps my initial 1230 points to a cluster. I need to map the point labeled as '2468' to its cluster and I'm not sure how to do that.
Thanks!

I found the way to do it
fig, ax = plt.subplots(2,2,figsize=(10,5))
ax = ax.ravel()
# [idx_plot[k]:, idx_plot[k]:]
for k, val in enumerate(linkages['ward']):
cluster_local = cluster_labels[val]['ward'][6]
leaders = sch.leaders(linkages['ward'][val], cluster_local)
dates_labels = dict()
for v, i in enumerate(leaders[1]):
date_idx = np.where(cluster_local == i)
dates_labels[leaders[0][v]] = (fechas[val][idx_plot[val]:][date_idx[0][0]].strftime('%y/%m'), fechas[val][idx_plot[val]:][date_idx[0][-1]].strftime('%y/%m'))
mislabels = [dates_labels[leaders[0][i]][0] + ', ' + dates_labels[leaders[0][i]][1] for i in range(6)]
yuca = sch.dendrogram(linkages['ward'][val], truncate_mode='lastp', ax=ax[k], leaf_label_func=llf, p=6, show_contracted=False, show_leaf_counts=False,
leaf_rotation=0, no_labels=False, orientation = 'right' )
# ax[k].set_xticklabels(mislabels, fontsize=10, rotation=90)
ax[k].set_yticklabels(mislabels, fontsize=10, rotation=0)
ax[k].set_title(val)
plt.tight_layout()
plt.show()

Related

How could I edit my code to plot 4D contour something similar to this example in python?

Similar to many other researchers on stackoverflow who are trying to plot a contour graph out of 4D data (i.e., X,Y,Z and their corresponding value C), I am attempting to plot a 4D contour map out of my data. I have tried many of the suggested solutions in stackover flow. From all of the plots suggested this, and this were the closest to what I want but sill not quite what I need in terms of data interpretation. Here is the ideal plot example: (source)
Here is a subset of the data. I put it on the dropbox. Once this data is downloaded to the directory of the python file, the following code will work. I have modified this script from this post.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.tri as mtri
#####Importing the data
df = pd.read_csv('Data_4D_plot.csv')
do_random_pt_example = False;
index_x = 0; index_y = 1; index_z = 2; index_c = 3;
list_name_variables = ['x', 'y', 'z', 'c'];
name_color_map = 'seismic';
if do_random_pt_example:
number_of_points = 200;
x = np.random.rand(number_of_points);
y = np.random.rand(number_of_points);
z = np.random.rand(number_of_points);
c = np.random.rand(number_of_points);
else:
x = df['X'].to_numpy();
y = df['Y'].to_numpy();
z = df['Z'].to_numpy();
c = df['C'].to_numpy();
#end
#-----
# We create triangles that join 3 pt at a time and where their colors will be
# determined by the values of their 4th dimension. Each triangle contains 3
# indexes corresponding to the line number of the points to be grouped.
# Therefore, different methods can be used to define the value that
# will represent the 3 grouped points and I put some examples.
triangles = mtri.Triangulation(x, y).triangles;
choice_calcuation_colors = 2;
if choice_calcuation_colors == 1: # Mean of the "c" values of the 3 pt of the triangle
colors = np.mean( [c[triangles[:,0]], c[triangles[:,1]], c[triangles[:,2]]], axis = 0);
elif choice_calcuation_colors == 2: # Mediane of the "c" values of the 3 pt of the triangle
colors = np.median( [c[triangles[:,0]], c[triangles[:,1]], c[triangles[:,2]]], axis = 0);
elif choice_calcuation_colors == 3: # Max of the "c" values of the 3 pt of the triangle
colors = np.max( [c[triangles[:,0]], c[triangles[:,1]], c[triangles[:,2]]], axis = 0);
#end
#----------
###=====adjust this part for the labeling of the graph
list_name_variables[index_x] = 'X (m)'
list_name_variables[index_y] = 'Y (m)'
list_name_variables[index_z] = 'Z (m)'
list_name_variables[index_c] = 'C values'
# Displays the 4D graphic.
fig = plt.figure(figsize = (15,15));
ax = fig.gca(projection='3d');
triang = mtri.Triangulation(x, y, triangles);
surf = ax.plot_trisurf(triang, z, cmap = name_color_map, shade=False, linewidth=0.2);
surf.set_array(colors); surf.autoscale();
#Add a color bar with a title to explain which variable is represented by the color.
cbar = fig.colorbar(surf, shrink=0.5, aspect=5);
cbar.ax.get_yaxis().labelpad = 15; cbar.ax.set_ylabel(list_name_variables[index_c], rotation = 270);
# Add titles to the axes and a title in the figure.
ax.set_xlabel(list_name_variables[index_x]); ax.set_ylabel(list_name_variables[index_y]);
ax.set_zlabel(list_name_variables[index_z]);
ax.view_init(elev=15., azim=45)
plt.show()
Here would be the output:
Although it looks brilliant, it is not quite what I am looking for (the above contour map example). I have modified the following script from this post in the hope to reach the required graph, however, the chart looks nothing similar to what I was expecting (something similar to the previous output graph). Warning: the following code may take some time to run.
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
df = pd.read_csv('Data_4D_plot.csv')
x = df['X'].to_numpy();
y = df['Y'].to_numpy();
z = df['Z'].to_numpy();
cc = df['C'].to_numpy();
# convert to 2d matrices
Z = np.outer(z.T, z)
X, Y = np.meshgrid(x, y)
C = np.outer(cc.T,cc)
# fourth dimention - colormap
# create colormap according to cc-value
color_dimension = C # change to desired fourth dimension
minn, maxx = color_dimension.min(), color_dimension.max()
norm = matplotlib.colors.Normalize(minn, maxx)
m = plt.cm.ScalarMappable(norm=norm, cmap='jet')
m.set_array([])
fcolors = m.to_rgba(color_dimension)
# plot
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(X,Y,Z, rstride=1, cstride=1, facecolors=fcolors, vmin=minn, vmax=maxx, shade=False)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.show()
Now I was wondering from our kind community and experts if you can help me to plot a contour figure similar to the example graph (image one in this post), where the contours are based on the values within the range of C?

Sort simmilarity matrix according to plot colors

I have this similarity matrix plot of some documents. I want to sort the values of the matrix, which is a numpynd array, to group colors, while maintaining their relative position (diagonal yellow line), and labels as well.
path = "C:\\Users\\user\\Desktop\\texts\\dataset"
text_files = os.listdir(path)
#print (text_files)
tfidf_vectorizer = TfidfVectorizer()
documents = [open(f, encoding="utf-8").read() for f in text_files if f.endswith('.txt')]
sparse_matrix = tfidf_vectorizer.fit_transform(documents)
labels = []
for f in text_files:
if f.endswith('.txt'):
labels.append(f)
pairwise_similarity = sparse_matrix * sparse_matrix.T
pairwise_similarity_array = pairwise_similarity.toarray()
fig, ax = plt.subplots(figsize=(20,20))
cax = ax.matshow(pairwise_similarity_array, interpolation='spline16')
ax.grid(True)
plt.title('News articles similarity matrix')
plt.xticks(range(23), labels, rotation=90);
plt.yticks(range(23), labels);
fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
plt.show()
Here is one possibility.
The idea is to use the information in the similarity matrix and put elements next to each other if they are similar. If two items are similar they should also be similar with respect to other elements ie have similar colors.
I start with the element which has the most in common with all other elements (this choice is a bit arbitrary) [a] and as next element I choose from the remaining elements the one which is closest to the current [b].
import numpy as np
import matplotlib.pyplot as plt
def create_dummy_sim_mat(n):
sm = np.random.random((n, n))
sm = (sm + sm.T) / 2
sm[range(n), range(n)] = 1
return sm
def argsort_sim_mat(sm):
idx = [np.argmax(np.sum(sm, axis=1))] # a
for i in range(1, len(sm)):
sm_i = sm[idx[-1]].copy()
sm_i[idx] = -1
idx.append(np.argmax(sm_i)) # b
return np.array(idx)
n = 10
sim_mat = create_dummy_sim_mat(n=n)
idx = argsort_sim_mat(sim_mat)
sim_mat2 = sim_mat[idx, :][:, idx] # apply reordering for rows and columns
# Plot results
fig, ax = plt.subplots(1, 2)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat2)
def ticks(_ax, ti, la):
_ax.set_xticks(ti)
_ax.set_yticks(ti)
_ax.set_xticklabels(la)
_ax.set_yticklabels(la)
ticks(_ax=ax[0], ti=range(n), la=range(n))
ticks(_ax=ax[1], ti=range(n), la=idx)
After meTchaikovsky's answer I also tested my idea on a clustered similarity matrix (see first image) this method works but is not perfect (see second image).
Because I use the similarity between two elements as approximation to their similarity to all other elements, it is quite clear why this does not work perfectly.
So instead of using the initial similarity to sort the elements one could calculate a second order similarity matrix which measures how similar the similarities are (sorry).
This measure describes better what you are interested in. If two rows / columns have similar colors they should be close to each other. The algorithm to sort the matrix is the same as before
def add_cluster(sm, c=3):
idx_cluster = np.array_split(np.random.permutation(np.arange(len(sm))), c)
for ic in idx_cluster:
cluster_noise = np.random.uniform(0.9, 1.0, (len(ic),)*2)
sm[ic[np.newaxis, :], ic[:, np.newaxis]] = cluster_noise
def get_sim_mat2(sm):
return 1 / (np.linalg.norm(sm[:, np.newaxis] - sm[np.newaxis], axis=-1) + 1/n)
sim_mat = create_dummy_sim_mat(n=100)
add_cluster(sim_mat, c=4)
sim_mat2 = get_sim_mat2(sim_mat)
idx = argsort_sim_mat(sim_mat)
idx2 = argsort_sim_mat(sim_mat2)
sim_mat_sorted = sim_mat[idx, :][:, idx]
sim_mat_sorted2 = sim_mat[idx2, :][:, idx2]
# Plot results
fig, ax = plt.subplots(1, 3)
ax[0].imshow(sim_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(sim_mat_sorted2)
The results with this second method are quite good (see third image)
but I guess there exist cases where this approach also fails, so I would be happy about feedback.
Edit
I tried to explain it and did also link the ideas to the code with [a] and [b], but obviously I did not do a good job, so here is a second more verbose explanation.
You have n elements and a n x n similarity matrix sm where each cell (i, j) describes how similar element i is to element j. The goal is to order the rows / columns in such a way that one can see existing patterns in the similarity matrix. My idea to achieve this is really simple.
You start with an empty list and add elements one by one. The criterion for the next element is the similarity to the current element. If element i was added in the last step, I chose the element argmax(sm[i, :]) as next, ignoring the elements already added to the list. I ignore the elements by setting the values of those elements to -1.
You can use the function ticks to reorder the labels:
labels = np.array(labels) # make labels an numpy array, to index it with a list
ticks(_ax=ax[0], ti=range(n), la=labels[idx])
#scleronomic's solution is very elegant, but it also has one shortage, which is we cannot set the number of clusters in the sorted correlation matrix. Assume we are working with a set of variables, in which some of them are weakly correlated
import string
import numpy as np
import pandas as pd
n_variables = 20
n_clusters = 10
n_samples = 100
np.random.seed(100)
names = list(string.ascii_lowercase)[:n_variables]
belongs_to_cluster = np.random.randint(0,n_clusters,n_variables)
latent = np.random.randn(n_clusters,n_samples)
variables = np.random.rand(n_variables,n_samples)
for ind in range(n_clusters):
mask = belongs_to_cluster == ind
# weakening the correlation
if ind % 2 == 0:variables[mask] += latent[ind]*0.1
variables[mask] += latent[ind]
df = pd.DataFrame({key:val for key,val in zip(names,variables)})
corr_mat = np.array(df.corr())
As you can see, there are 10 clusters of variables by construction, however, variables within clusters that has an even index are weakly correlated. If we only want to see roughly 5 clusters in the sorted correlation matrix, maybe we need to find another way.
Based on this post, which is the accepted answer to the question "Clustering a correlation matrix", to sort a correlation matrix into blocks, what we need to find are blocks, where correlations within blocks are high and correlations between blocks are low. However, the solution provided by this accepted answer works best when we know how many blocks are there in the first place, and more importantly, the sizes of the underlying blocks are the same, or at least similar. Therefore, I improved the solution with a new function sort_corr_mat
def sort_corr_mat(corr_mat,clusters_guess):
def _swap_rows(corr_mat, var1, var2):
rs = corr_mat.copy()
rs[var2, :],rs[var1, :]= corr_mat[var1, :],corr_mat[var2, :]
cs = rs.copy()
cs[:, var2],cs[:, var1] = rs[:, var1],rs[:, var2]
return cs
# analysis
max_iter = 500
best_score,current_score,best_count = -1e8,-1e8,0
num_minimua_to_visit = 20
best_corr = corr_mat
best_ordering = np.arange(n_variables)
for i in range(max_iter):
for row1 in range(n_variables):
for row2 in range(n_variables):
if row1 == row2: continue
option_ordering = best_ordering.copy()
option_ordering[row1],option_ordering[row2] = best_ordering[row2],best_ordering[row1]
option_corr = _swap_rows(best_corr,row1,row2)
option_score = score(option_corr,n_variables,clusters_guess)
if option_score > best_score:
best_corr = option_corr
best_ordering = option_ordering
best_score = option_score
if best_score > current_score:
best_count += 1
current_corr = best_corr
current_ordering = best_ordering
current_score = best_score
if best_count >= num_minimua_to_visit:
return best_corr#,best_ordering
return best_corr#,best_ordering
With this function and the corr_mat constructed in the first place, I compared the result obtained with my function (on the right) with that obtained with #scleronomic's solution (in the middle)
sim_mat_sorted = corr_mat[argsort_sim_mat(corr_mat), :][:, argsort_sim_mat(corr_mat)]
corr_mat_sorted = sort_corr_mat(corr_mat,clusters_guess=5)
# Plot results
fig, ax = plt.subplots(1,3,figsize=(18,6))
ax[0].imshow(corr_mat)
ax[1].imshow(sim_mat_sorted)
ax[2].imshow(corr_mat_sorted)
Clearly, #scleronomic's solution works much better and faster, but my solution offers more control to the pattern of the output.

TypeError for C dimension in matplotlib pcolormesh

I am grouping a dataframe via a certain column, and attempting to pcolormesh each grouped dataframe to its own subplot, ensuring that all subplots have the same bins.
My code is as follows:
abins, rbins = np.arange(0,25,1),np.arange(0,70,1)
A, R = np.meshgrid(np.deg2rad(15*abins),rbins)
fig, ax = plt.subplots(2,4,subplot_kw=dict(projection="polar"), figsize=(20,10),sharey=True)
k=0
for name, df in bigdf.groupby('sv'):
pc = ax.reshape(-1)[k].pcolormesh(A,R,df.groupby([pd.cut(df.el, rbins), pd.cut(df.mlt, abins)])['vtec'].mean().unstack(), \
cmap='RdBu_r')
k+=1
However, when I run I get the following error:
TypeError: Dimensions of C (58, 16) are incompatible with X (25) and/or Y (70); see help(pcolormesh)
I believe that there is something going on with A and R, since when I run
abins, rbins = np.arange(0,25,1),np.arange(0,70,1)
A, R = np.meshgrid(np.deg2rad(15*abins),rbins)
fig, ax = plt.subplots(2,4,subplot_kw=dict(projection="polar"), figsize=(20,10),sharey=True)
k=0
for name, df in bigdf.groupby('sv'):
pc = ax.reshape(-1)[k].pcolormesh(df.groupby([pd.cut(df.el, rbins), pd.cut(df.mlt, abins)])['vtec'].mean().unstack(), \
cmap='RdBu_r')
k+=1
without A and R, I get the following result:
but I require A and R so that I have my desired bins. Alternatively something could be going on in the pd.cut, but I do not know much about this function.
Any ideas? I can provide the dataframe if necessary, but for reproducability, column el, and mlt can be random values within rbins and abins, respectively. And vtec can just be a random set of values.
EDIT: Here is a subsection of the dataframe to reproduce the problem https://drive.google.com/open?id=1qEn6i2zU6sblY9kwfMPrDRY8rXS9i0Gj
I believe the error is occurring due to pd.cut() discarding desired bins when no values fall in them. Is there a way to stop it from doing this?
I managed to fix the issue using pd.MultiIndex.from_product and reindex to force the pd.groupby after pd.cut to keep empty bins, even if there are no data to populate them:
abins, rbins = np.arange(0,25,1),np.arange(0,70,1)
A, R = np.meshgrid(np.deg2rad(15*abins),rbins)
fig, ax = plt.subplots(2,4,subplot_kw=dict(projection="polar"), figsize=(20,10),sharey=True)
k=0
for name, df in bigdf.groupby('sv'):
idx = pd.MultiIndex.from_product([rbins, abins])
pc = ax.reshape(-1)[k].pcolormesh(A,R,df.groupby([pd.cut(df.el, rbins), pd.cut(df.mlt, abins)])['vtec'].mean().reindex(idx, fill_value=np.nan).unstack(), \
cmap='RdBu_r')
k+=1
Which gives the desired output:

matplotlib set stacked bar chart labels

I am trying to create a stacked bar chart that groups data by operating system. I'm having trouble creating an individual label for each component in each bar.
What I'm trying to do is different from the example in the docs because in my data each category appears in only one bar, whereas in the example each bar contains one member of each category.
Currently I have this code
plt.cla()
plt.clf()
plt.close()
def get_cmap(n, name='hsv'):
'''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
RGB color; the keyword argument name must be a standard mpl colormap name.'''
return plt.cm.get_cmap(name, n)
fig = plt.figure(figsize=(18, 10), dpi=80)
# group by the prefixes for now
prefixes = []
indices = []
bars = []
legend = {}
cmap = get_cmap(len(os_counts.index) + 1)
k = 0
for i, prefix in enumerate(d):
indices.append(i)
if len(d[prefix]["names"]) == 1:
prefixes.append(d[prefix]["names"][0])
else:
prefixes.append(prefix)
#colors = [next(cycol) for j in range(len(d[prefix]["names"]))]
colors = [cmap(k + j) for j in range(len(d[prefix]["names"]))]
k += len(colors)
bar = plt.bar([i] * len(d[prefix]["names"]), d[prefix]["values"], color=colors, label=d[prefix]["names"])
bars.append(bar)
plt.xticks(rotation=90)
plt.ylabel("Frequency")
plt.xlabel("Operating System")
plt.xticks(indices, prefixes)
plt.legend()
plt.show()
Which produces this result. As you can see, the legend is created for the first colour within the bar and shows an array.
I think that each call to plt.bar gets one label. So, you are giving it a list as a label for each plt.bar call. If you want a label for every color, representing every operating system then I think the solution is to call plt.bar once for each color or os.

how to update a matplotlib heatmap plot without creating a new window

I have matrix class that inherits from list. This class can display itself as a matplotlib heatmap representation of the matrix.
I'm trying to have the class written such that when I change values in the matrix, I can call the matrix's method plot() and it'll update the plot to reflect the matrix changes in the heatmap.
However, every time I run the method plot(), it creates a new heatmap in a new window instead of updating the existing plot. How could I get it simply to update the existing plot?
In the code below, there are three main parts: the main function shows how an instance of the matrix class is created, plotted and updated; the matrix class is basically a list object, with some minor functionality (including plotting) bolted on; the function plotList() is the function the matrix class calls in order to generate the plot object initially.
import time
import random
import matplotlib.pyplot as plt
plt.ion()
import numpy as np
def main():
print("plot 2 x 2 matrix and display it changing in a loop")
matrix = Matrix(
numberOfColumns = 2,
numberOfRows = 2,
randomise = True
)
# Plot the matrix.
matrix.plot()
# Change the matrix, redrawing it after each change.
for row in range(len(matrix)):
for column in range(len(matrix[row])):
input("Press Enter to continue.")
matrix[row][column] = 10
matrix.plot()
input("Press Enter to terminate.")
matrix.closePlot()
class Matrix(list):
def __init__(
self,
*args,
numberOfColumns = 3,
numberOfRows = 3,
element = 0.0,
randomise = False,
randomiseLimitLower = -0.2,
randomiseLimitUpper = 0.2
):
# list initialisation
super().__init__(self, *args)
self.numberOfColumns = numberOfColumns
self.numberOfRows = numberOfRows
self.element = element
self.randomise = randomise
self.randomiseLimitLower = randomiseLimitLower
self.randomiseLimitUpper = randomiseLimitUpper
# fill with default element
for column in range(self.numberOfColumns):
self.append([element] * self.numberOfRows)
# fill with pseudorandom elements
if self.randomise:
random.seed()
for row in range(self.numberOfRows):
for column in range(self.numberOfColumns):
self[row][column] = random.uniform(
self.randomiseLimitUpper,
self.randomiseLimitLower
)
# plot
self._plot = plotList(
list = self,
mode = "return"
)
# for display or redraw plot behaviour
self._plotShown = False
def plot(self):
# display or redraw plot
self._plot.draw()
if self._plotShown:
#self._plot = plotList(
# list = self,
# mode = "return"
# )
array = np.array(self)
fig, ax = plt.subplots()
heatmap = ax.pcolor(array, cmap = plt.cm.Blues)
self._plot.draw()
else:
self._plot.show()
self._plotShown = True
def closePlot(self):
self._plot.close()
def plotList(
list = list,
mode = "plot" # plot/return
):
# convert list to NumPy array
array = np.array(list)
# create axis labels
labelsColumn = []
labelsRow = []
for rowNumber in range(0, len(list)):
labelsRow.append(rowNumber + 1)
for columnNumber in range(0, len(list[rowNumber])):
labelsColumn.append(columnNumber)
fig, ax = plt.subplots()
heatmap = ax.pcolor(array, cmap = plt.cm.Blues)
# display plot or return plot object
if mode == "plot":
plt.show()
elif mode == "return":
return(plt)
else:
Exception
if __name__ == '__main__':
main()
I'm using Python 3 in Ubuntu.
The method plot(self) creates a new figure in the line fig, ax = plt.subplots(). To use an existing figure you can give your figure a number or name when it's first created in plotList():
fig = plt.figure('matrix figure')
ax = fig.add_subplot(111)
then use
plt.figure('matrix figure')
ax = gca() # gets current axes
to make that the active figure and axes. Alternately, you might want to the figure and axis created in plotList and pass them to plot.

Resources