How do I implement mean shift by using a grid of centroids? - python-3.x

This is for a class and I would really appreciate your help! I made some changes based on a comment I received, but now I get another error.
I need to modify an existing function that implements the mean-shift algorithm, but instead of initializing all the points as the first set of centroids, the function should create a grid of centroids, with the grid spacing based on the radius. I also need to delete the centroids that don't contain any data points. My issue is that I don't understand how to fix the error I get!
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-7-de18ffed728f> in <module>()
     49 centroids = initialize_centroids(x)
     50
---> 51 new_centroids = update_centroids(x, centroids, r = 1)
     52
     53 print(len(centroids))

<ipython-input-7-de18ffed728f> in update_centroids(data, centroids, r)
     26         #print(len(centroids))
     27         #print(range(len(centroids)))
---> 28         centroid = centroids[i]
     29         for data_point in data:
     30             if np.linalg.norm(data_point - centroid) < r:

IndexError: index 2 is out of bounds for axis 0 with size 2
I tried using the range of the input dataset as boundaries for a grid, with the points separated by the radius.
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

def initialize_centroids(data, r = 1):
    '''Creates a grid of centroids with grid based on radius'''
    data = np.array(data)
    xi, yi = min(range(len(data))), max(range(len(data)))
    mx = np.arange(xi, yi, r)
    x, y = np.meshgrid(mx, mx)
    centroids = np.vstack([x.ravel(), y.ravel()])
    return centroids

# update centroids based on mean of points that fall within a specified radius of each centroid
def update_centroids(data, centroids, r = 1):
    new_centroids = []
    for i in centroids:
        in_radius = []
        centroid = centroids[i]  # this is where the error occurs
        for data_point in data:
            if np.linalg.norm(data_point - centroid) < r:
                in_radius.append(data_point)  # the data point is added if the above condition is satisfied
        new_centroid = np.mean(in_radius, axis=0)
        # maybe another way to do the next part
        new_centroids.append(tuple(new_centroid))
    unique_centroids = sorted(list(set(new_centroids)))  # deduplicate: skip elements already seen (append does not work with set)
    new_centroids = {i: np.array(unique_centroids[i]) for i in range(len(unique_centroids))}
    return new_centroids

# test function on:
x, y = datasets.make_blobs(n_samples=300, n_features=2, centers=[[0, 7], [0, -7], [5, 7], [5, 0]])
centroids = initialize_centroids(x)
new_centroids = update_centroids(x, centroids, r=2)
print(len(centroids))
print()
print(len(new_centroids))

# code for plotting initially:
plt.scatter(x[:, 0], x[:, 1], color='k')
for i in range(len(new_centroids)):
    plt.scatter(new_centroids[i][0], new_centroids[i][1], s=200, color='r', marker="*")

# code for plotting updated centroids:
new_centroids = update_centroids(x, new_centroids, r=2)
plt.scatter(x[:, 0], x[:, 1], color='k')
for i in range(len(new_centroids)):
    plt.scatter(new_centroids[i][0], new_centroids[i][1], s=200, color='r', marker="*")

# code for iterations:
def iterate_to_conv(data, max_iter=100):
    centroids = initialize_centroids(data)
    iter_count = 0
    while iter_count <= max_iter:
        new_centroids = update_centroids(data, centroids, r=2)
        centroids = new_centroids
        iter_count += 1
    return centroids

centroids = iterate_to_conv(x)
plt.scatter(x[:, 0], x[:, 1], color='k')
for i in range(len(centroids)):
    plt.scatter(centroids[i][0], centroids[i][1], s=200, color='r', marker="*")
The function needs to return the number of final centroids. I haven't gotten far enough to know how the entire implementation of mean-shift would work with this function.

When you run the loop for i in centroids, the i that iterates through centroids isn't a number, it is a vector, which is why the error pops up. For example, the first i value might be equal to [0 1 2 0 1 2 0 1 2], so centroids[i] means indexing with that whole vector of values, and those values run past the array's first axis (which only has size 2). To fix it, you really need to change how your initialize_centroids function works: np.vstack gives an array of shape (2, n_points), so iterating over it yields the two coordinate rows rather than individual centroids. Also note that meshgrid as used here won't create an N-dimensional grid, so this works for 2 dimensions but not N. I hope that helps.
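One way to restructure both functions along those lines, as a sketch: build the grid from the data's actual value range (not its index range), stack the grid points as rows, and iterate over those rows directly. This assumes 2D data, and it also drops grid centroids that capture no points, as the assignment asks.

import numpy as np

def initialize_centroids(data, r=1):
    '''Creates a grid of centroids spanning the data's range, spaced by r'''
    data = np.array(data)
    lo, hi = data.min(), data.max()                    # value range, not index range
    mx = np.arange(lo, hi + r, r)
    gx, gy = np.meshgrid(mx, mx)
    return np.column_stack([gx.ravel(), gy.ravel()])   # shape (n_centroids, 2)

def update_centroids(data, centroids, r=1):
    new_centroids = []
    for centroid in centroids:                         # iterate over rows directly, no indexing
        in_radius = [p for p in data if np.linalg.norm(p - centroid) < r]
        if in_radius:                                  # skip centroids with no nearby points
            new_centroids.append(tuple(np.mean(in_radius, axis=0)))
    unique_centroids = sorted(set(new_centroids))
    return {i: np.array(c) for i, c in enumerate(unique_centroids)}

The key difference is np.column_stack (each row is one centroid) instead of np.vstack (each row is one coordinate axis), which is what made for i in centroids yield index vectors.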

Related

4D chaotic system Lyapunov exponent

I am trying to work out the Lyapunov spectrum of a 4-dimensional chaotic attractor and its values. So far the code below works well for a three-dimensional system, but errors arise in 4D and 5D systems.
import matplotlib.pyplot as plt
import numpy as np
from scipy.integrate import odeint

def diff_Lorenz(u):
    x, y, z, w = u
    f = [a*(y-x), x*z+w, b-x*y, z*y-c*w]
    Df = [[-a, a, 0, 0], [z, 0, x, 1], [-y, -x, 0, 0], [0, z, y, -c]]
    return np.array(f), np.array(Df)

def LEC_system(u):
    #x,y,z = u[:3]
    U = u[2:18].reshape([4,4])
    L = u[12:15]
    f, Df = diff_Lorenz(u[:4])
    A = U.T.dot(Df.dot(U))
    dL = np.diag(A).copy();
    for i in range(4):
        A[i,i] = 0
        for j in range(i+1,4): A[i,j] = -A[j,i]
    dU = U.dot(A)
    return np.concatenate([f, dU.flatten(), dL])

a = 6; b = 11; c = 5;
u0 = np.ones(4)
U0 = np.identity(4)
L0 = np.zeros(4)
u0 = np.concatenate([u0, U0.flatten(), L0])
t = np.linspace(0, 10, 301)
u = odeint(lambda u, t: LEC_system(u), u0, t, hmax=0.05)
L = u[5:, 12:15].T / t[5:]
# plt.plot(t[5:], L.T)
# plt.show()
p1 = L[0,:]; p2 = L[1,:]; p3 = L[2,:]; p4 = L[3,:]
L1 = np.mean(L[0,:]); L2 = np.average(L[1,:]); L3 = np.average(L[2,:]); L4 = np.average(L[3,:])
t1 = np.linspace(0, 100, len(p1))
plt.plot(t1, p1); plt.plot(t1, p2); plt.plot(t1, p3); plt.plot(t1, p4)
# plt.show()
print('LES= ', L1, L2, L3, L4)
the output error is
D:\anaconda3\lib\site-packages\scipy\integrate\odepack.py:247: ODEintWarning: Excess work done on this call (perhaps wrong Dfun type). Run with full_output = 1 to get quantitative information.
  warnings.warn(warning_msg, ODEintWarning)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7008/1971199288.py in <module>
     32 # plt.plot(t[5:],L.T)
     33 # plt.show()
---> 34 p1=L[0,:];p2=L[1,:];p3=L[2,:];p4=L[3,:]
     35 L1=np.mean(L[0,:]);L2=np.average(L[1,:]);L3=np.average(L[2,:]);L4=np.average(L[3,:])
     36 t1 = np.linspace(0,100,len(p1))

IndexError: index 3 is out of bounds for axis 0 with size 3
What is wrong? The expected output is
L1=.5162, L2=-.0001, L3=-4.9208, L4=-6.5954
In LEC_system(u), the flat vector u contains in sequence
- the state vector, n components,
- the eigenbasis U, an n x n matrix,
- the accumulated exponents L, n components.
With n=4, this translates to the decomposition
def LEC_system(u):
    #x,y,z,w = u[:4]
    U = u[4:20].reshape([4,4])
    L = u[20:24]
    f, Df = diff_Lorenz(u[:4])
    A = U.T.dot(Df.dot(U))
    dL = np.diag(A).copy()
    for i in range(4):
        A[i,i] = 0
        for j in range(i+1,4): A[i,j] = -A[j,i]
    dU = U.dot(A)
    return np.concatenate([f, dU.flatten(), dL])
Of course, in the evaluation after the integration one likewise has to use the correct segment of the state vector
L = u[5:,20:24].T/t[5:]
Then I get a plot of the four exponent estimates, and using only the latter quarter of the graphs, after integrating to t=60, I get
LES= 0.029214865425355396 -0.43816854013111833 -4.309199339754925 -6.28183676249535
These are still not the expected values, as that spectrum appears to be contracting along all directions transversal to the solution curve.
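To move between 3D, 4D and 5D systems without hand-editing the slice bounds each time, the layout can be computed from n. A small sketch (the helper name is mine, not part of the original code):

def unpack_state(u, n):
    x = u[:n]                          # state vector, n components
    U = u[n:n + n*n].reshape(n, n)     # eigenbasis, n x n matrix
    L = u[n + n*n:n + n*n + n]         # accumulated exponents, n components
    return x, U, L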

mplcursors: show and highlight coordinates of nearby local extreme

I have code that shows the label for each point in a matplotlib scatterplot using mplcursors, similar to this example. I want to know how, from a list of values, to make a certain point stand out. For instance, if I have a graph of points y=-x^2 and I go near the peak, it should show 0 instead of 0.001, without the trouble of needing to find the exact mouse placement at the top. I can't solve for each point in the graph, as I don't have a specific function.
Supposing the points in the scatter plot are ordered, we can investigate whether an extreme in a nearby window is also an extreme in a somewhat larger window. If so, we can report that extreme with its x and y coordinates.
The code below only shows the annotation when we're close to a local maximum or minimum. It also temporarily shows a horizontal and a vertical line to indicate the exact spot. The code can be a starting point for many variations.
import matplotlib.pyplot as plt
import mplcursors
import numpy as np

near_window = 10  # the width of the nearby window
far_window = 20   # the width of the far window

def show_annotation(sel):
    ind = sel.target.index
    near_start_index = max(0, ind - near_window)
    y_near = y[near_start_index: min(N, ind + near_window)]
    y_far = y[max(0, ind - far_window): min(N, ind + far_window)]
    near_max = y_near.max()
    far_max = y_far.max()
    annotation_str = ''
    if near_max == far_max:
        near_argmax = y_near.argmax()
        annotation_str = f'local max:\nx:{x[near_start_index + near_argmax]:.3f}\ny:{near_max:.3f}'
        maxline = plt.axhline(near_max, color='crimson', ls=':')
        maxline_x = plt.axvline(x[near_start_index + near_argmax], color='grey', ls=':')
        sel.extras.append(maxline)
        sel.extras.append(maxline_x)
    else:
        near_min = y_near.min()
        far_min = y_far.min()
        if near_min == far_min:
            near_argmin = y_near.argmin()
            annotation_str = f'local min:\nx:{x[near_start_index + near_argmin]:.3f}\ny:{near_min:.3f}'
            minline = plt.axhline(near_min, color='limegreen', ls=':')
            minline_x = plt.axvline(x[near_start_index + near_argmin], color='grey', ls=':')
            sel.extras.append(minline)
            sel.extras.append(minline_x)
    if len(annotation_str) > 0:
        sel.annotation.set_text(annotation_str)
    else:
        sel.annotation.set_visible(False)  # hide the annotation
    # sel.annotation.set_text(f'x:{sel.target[0]:.3f}\n y:{sel.target[1]:.3f}')

N = 500
x = np.linspace(0, 100, 500)
y = np.cumsum(np.random.normal(0, 0.1, N))
box = np.ones(20) / 20
y = np.convolve(y, box, mode='same')
scat = plt.scatter(x, y, s=1)
cursor = mplcursors.cursor(scat, hover=True)
cursor.connect('add', show_annotation)
plt.show()
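A variation on the same idea is to precompute the local extrema once with SciPy and only snap to them in the callback, which avoids the window comparisons on every hover event. A sketch reusing x, y, near_window and far_window from above (argrelextrema is a real SciPy function, but using order=far_window as the neighbourhood width is my assumption):

from scipy.signal import argrelextrema

maxima = argrelextrema(y, np.greater, order=far_window)[0]  # indices of local maxima
minima = argrelextrema(y, np.less, order=far_window)[0]     # indices of local minima

def show_annotation_precomputed(sel):
    ind = sel.target.index
    # snap to a precomputed extreme if one lies within the near window
    for extrema, label in ((maxima, 'local max'), (minima, 'local min')):
        close = extrema[np.abs(extrema - ind) <= near_window]
        if close.size:
            k = close[np.argmin(np.abs(close - ind))]
            sel.annotation.set_text(f'{label}:\nx:{x[k]:.3f}\ny:{y[k]:.3f}')
            return
    sel.annotation.set_visible(False)

# cursor.connect('add', show_annotation_precomputed)  # use instead of show_annotation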

Draw the borders of a binary Numpy array with Matplotlib

I'm using image-segmentation on some images, and sometimes it would be nice to be able to plot the borders of the segments.
I have a 2D NumPy array that I plot with Matplotlib, and the closest I've gotten is using contour-plotting.
This cuts the corners in the array, but is otherwise perfect.
Can Matplotlib's contour-function be made to only plot vertical/horizontal lines, or is there some other way to do this?
An example can be seen here:
import matplotlib.pyplot as plt
import numpy as np
array = np.zeros((20, 20))
array[4:7, 3:8] = 1
array[4:7, 12:15] = 1
array[7:15, 7:15] = 1
array[12:14, 13:14] = 0
plt.imshow(array, cmap='binary')
plt.contour(array, levels=[0.5], colors='g')
plt.show()
I wrote some functions to achieve this some time ago, but I would be glad to figure out how it can be done quicker.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

def get_all_edges(bool_img):
    """
    Get a list of all edges (where the value changes from True to False) in the 2D boolean image.
    The returned array edges has the dimension (n, 2, 2).
    Edge i connects the pixels edges[i, 0, :] and edges[i, 1, :].
    Note that the indices of a pixel also denote the coordinates of its lower left corner.
    """
    edges = []
    ii, jj = np.nonzero(bool_img)
    for i, j in zip(ii, jj):
        # North
        if j == bool_img.shape[1] - 1 or not bool_img[i, j + 1]:
            edges.append(np.array([[i, j + 1],
                                   [i + 1, j + 1]]))
        # East
        if i == bool_img.shape[0] - 1 or not bool_img[i + 1, j]:
            edges.append(np.array([[i + 1, j],
                                   [i + 1, j + 1]]))
        # South
        if j == 0 or not bool_img[i, j - 1]:
            edges.append(np.array([[i, j],
                                   [i + 1, j]]))
        # West
        if i == 0 or not bool_img[i - 1, j]:
            edges.append(np.array([[i, j],
                                   [i, j + 1]]))

    if not edges:
        return np.zeros((0, 2, 2))
    else:
        return np.array(edges)

def close_loop_edges(edges):
    """
    Combine the edges defined by 'get_all_edges' to closed loops around objects.
    If there are multiple disconnected objects a list of closed loops is returned.
    Note that it's expected that all the edges are part of exactly one loop (but not necessarily the same one).
    """
    loop_list = []
    while edges.size != 0:
        loop = [edges[0, 0], edges[0, 1]]  # Start with first edge
        edges = np.delete(edges, 0, axis=0)
        while edges.size != 0:
            # Get next edge (= edge with common node)
            ij = np.nonzero((edges == loop[-1]).all(axis=2))
            if ij[0].size > 0:
                i = ij[0][0]
                j = ij[1][0]
            else:
                loop.append(loop[0])
                # Uncomment to make the start of the loop invisible when plotting
                # loop.append(loop[1])
                break
            loop.append(edges[i, (j + 1) % 2, :])
            edges = np.delete(edges, i, axis=0)
        loop_list.append(np.array(loop))
    return loop_list

def plot_outlines(bool_img, ax=None, **kwargs):
    if ax is None:
        ax = plt.gca()
    edges = get_all_edges(bool_img=bool_img)
    edges = edges - 0.5  # convert indices to coordinates; TODO adjust according to image extent
    outlines = close_loop_edges(edges=edges)
    cl = LineCollection(outlines, **kwargs)
    ax.add_collection(cl)

array = np.zeros((20, 20))
array[4:7, 3:8] = 1
array[4:7, 12:15] = 1
array[7:15, 7:15] = 1
array[12:14, 13:14] = 0

plt.figure()
plt.imshow(array, cmap='binary')
plot_outlines(array.T, lw=5, color='r')
plt.show()

How to reduce the number of boxes/regions created in MSER

I have been trying to get fewer boxes with MSER, since too many boxes are created repeatedly on the same element with very little pixel difference. My code is as below:
## Get mser, and set parameters
_delta = 10
_min_area = 250
_max_area = 800
_max_variation = 10.0
_min_diversity = 30.0
_max_evolution = 10
_area_threshold = 12.0
_min_margin = 2.9
_edge_blur_size = 3
mser = cv2.MSER_create(_delta, _min_area, _max_area, _max_variation,
                       _min_diversity, _max_evolution, _area_threshold, _min_margin, _edge_blur_size)
and then
## Do mser detection, get the coodinates and bboxes on the original image
gray = cv2.cvtColor(final, cv2.COLOR_BGR2GRAY)
coordinates, bboxes = mser.detectRegions(gray)
After this, I see there are 26K boxes created. Which of the parameters can be tuned to get fewer regions (since they overlap a lot)? Kindly help!
_delta is the most important parameter for reducing the number of boxes. Try raising it to 25; the higher the _delta, the fewer blobs you will get (see the sketch after this list).
- _min_area - the smallest blob
- _max_area - the largest blob
- _min_diversity - raise to reduce the number of overlapping blobs
- _max_variation - lower to drop unstable regions with high variation
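As a quick illustration, a minimal sketch of a tighter configuration (keyword names vary between OpenCV versions - some builds drop the leading underscore - so treat these as an assumption to adapt):

import cv2

mser = cv2.MSER_create(_delta=25,           # higher delta => fewer, more stable blobs
                       _min_area=250,       # discard blobs smaller than this
                       _max_area=800,       # discard blobs larger than this
                       _max_variation=0.25, # lower => drop unstable regions
                       _min_diversity=0.8)  # higher => fewer overlapping/nested blobs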
After that I would check the bboxes to filter out overlapping blobs.
Code Example
import cv2
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

img = cv2.imread('input_img.png')
iou_th = 0.95
mser = cv2.MSER_create(_delta=10, _min_area=1000, _max_area=int(0.1 * np.pi * (img.shape[0] / 2)**2), _max_variation=0.1)
regions, bboxes = mser.detectRegions(img)
hulls = [cv2.convexHull(p.reshape(-1, 1, 2)) for p in regions]

# Debug plot
img_ = img.copy()
cv2.polylines(img_, hulls, 1, (255, 0, 0), thickness=1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(img_)
ax.set_title('MSER with overlapping regions')
size_dict = {k: len(region) for k, region in enumerate(regions)}

# Cull overlapping blobs
graph = nx.Graph()
graph.add_nodes_from(range(len(hulls)))
for i in range(len(hulls)):
    for j in range(len(hulls)):
        if i >= j:
            continue
        box_i = bboxes[i]
        box_j = bboxes[j]
        tl_i = box_i[:2]
        tl_j = box_j[:2]
        br_i = tl_i + box_i[2:]
        br_j = tl_j + box_j[2:]
        tl = np.maximum(tl_i, tl_j)
        br = np.minimum(br_i, br_j)
        intersected_rect = br - tl
        intersection = np.prod(intersected_rect) if intersected_rect[0] > 0 and intersected_rect[1] > 0 else 0
        union = np.prod(box_i[2:]) + np.prod(box_j[2:]) - intersection
        iou = intersection / union
        if iou > iou_th:
            graph.add_edge(i, j, iou=iou)

# make list of unique regions - pick the smallest region
# (connected_component_subgraphs was removed in networkx 2.4; build the subgraphs directly)
trees = [graph.subgraph(c) for c in nx.connected_components(graph)]
unique_blobs = []
for tree in trees:
    # Choose the smallest region
    smallest_idx = None
    smallest_blob = np.inf
    for node in tree.nodes():
        if size_dict[node] < smallest_blob:
            smallest_blob = size_dict[node]
            smallest_idx = node
    unique_blobs.append(smallest_idx)
hulls = [hulls[k] for k in unique_blobs]
regions = [regions[k] for k in unique_blobs]
bboxes = [bboxes[k] for k in unique_blobs]
size_dict = {k: len(region) for k, region in enumerate(regions)}

# debug plot
img_ = img.copy()
cv2.polylines(img_, hulls, 1, (255, 0, 0), thickness=1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(img_)
ax.set_title('MSER with unique regions')

K means with a condition

I want to apply k-means (or any other simple clustering algorithm) to data with two variables, but I want the clusters to respect a condition: the sum of a third variable per cluster > some_value.
Is that possible?
Notations:
- K is the number of clusters
- let's say that the first two variables are point coordinates (x, y)
- V denotes the third variable
- Ci: the sum of V over each cluster i
- S: the total sum (sum of Ci)
- T: the threshold
Problem definition:
From what I understood, the aim is to run an algorithm that keeps the spirit of kmeans while respecting the constraints.
- Task 1 - group points by proximity to centroids [kmeans]
- Task 2 - for each cluster i, Ci > T [constraint]
Regular kmeans limitation for the constraint problem:
A regular kmeans assigns points to centroids by taking them in arbitrary order. In our case, this will lead to uncontrolled growth of the Ci while adding points.
For example, take K=2, T=40 and 4 points with the third variable equal to V1=50, V2=1, V3=50, V4=50.
Suppose also that points P1, P3, P4 are closer to centroid 1, and point P2 is closer to centroid 2.
Let's run the assignment step of a regular kmeans and keep track of Ci:
1-- take point P1, assign it to cluster 1. C1=50 > T
2-- take point P2, assign it to cluster 2. C2=1
3-- take point P3, assign it to cluster 1. C1=100 > T => C1 grows too much!
4-- take point P4, assign it to cluster 1. C1=150 > T => !!!
Modified kmeans:
In the previous example, we want to prevent C1 from growing too much and help C2 grow.
This is like pouring champagne into several glasses: if you see a glass with less champagne, you go and fill it. You do that because you have constraints: a limited amount of champagne (S is bounded) and you want every glass to have enough champagne (Ci > T).
Of course this is just an analogy. Our modified kmeans will add new points to the cluster with minimal Ci until the constraint is achieved (Task 2). Now, in which order should we add points? By proximity to centroids (Task 1). After the constraint is achieved for every cluster i, we can just run a regular kmeans on the remaining unassigned points.
Implementation:
Next, I give a python implementation of the modified algorithm. Figure 1 displays a representation of the third variable using transparency to visualize large vs. low values. Figure 2 displays the evolution of the clusters using color.
You can play with the accept_thresh parameter. In particular, note that:
- for accept_thresh = 0 => regular kmeans (the constraint is reached immediately)
- for accept_thresh = third_var.sum().sum() / (2*K), you might observe that some points closer to a given centroid are assigned to another one for constraint reasons.
CODE:
import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import time

nb_samples = 1000
K = 3  # for demo purpose, used to generate cloud points
c_std = 1.2

# Generate test samples :
points, classes = datasets.make_blobs(n_features=2, n_samples=nb_samples, \
                                      centers=K, cluster_std=c_std)

third_var_distribution = 'cubic_bycluster'  # 'uniform'
if third_var_distribution == 'uniform':
    third_var = np.random.random((nb_samples))
elif third_var_distribution == 'linear_bycluster':
    third_var = np.random.random((nb_samples))
    third_var = third_var * classes
elif third_var_distribution == 'cubic_bycluster':
    third_var = np.random.random((nb_samples))
    third_var = third_var * classes

# Threshold parameters :
# Try with K=3 and :
# T = K => one cluster reaches the constraint, two clusters won't converge
# T = 2K =>
accept_thresh = third_var.sum().sum() / (2*K)

def dist2centroids(points, centroids):
    '''return arrays of ordered points to each centroids
    first array is index of points
    second array is distance to centroid
    dim 0 : centroid
    dim 1 : distance or point index
    '''
    dist = np.sqrt(((points - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    ord_dist_indices = np.argsort(dist, axis=1)
    ord_dist_indices = ord_dist_indices.transpose()
    dist = dist.transpose()
    return ord_dist_indices, dist

def assign_points_with_constraints(inds, dists, tv, accept_thresh):
    assigned = [False] * nb_samples
    assignements = np.ones(nb_samples, dtype=int) * (-1)
    cumul_third_var = np.zeros(K, dtype=float)
    current_inds = np.zeros(K, dtype=int)
    max_round = nb_samples * K

    for round in range(0, max_round):  # we'll break anyway
        # worst advanced cluster in terms of cumulated third_var :
        cluster = np.argmin(cumul_third_var)
        if cumul_third_var[cluster] > accept_thresh:
            continue  # cluster had enough samples

        while current_inds[cluster] < nb_samples:
            # add points to increase cumulated third_var on this cluster
            i_inds = current_inds[cluster]
            closest_pt_index = inds[i_inds][cluster]
            if assigned[closest_pt_index] == True:
                current_inds[cluster] += 1
                continue  # pt already assigned to a cluster
            assignements[closest_pt_index] = cluster
            cumul_third_var[cluster] += tv[closest_pt_index]
            assigned[closest_pt_index] = True
            current_inds[cluster] += 1

            new_cluster = np.argmin(cumul_third_var)
            if new_cluster != cluster:
                break

    return assignements, cumul_third_var

def assign_points_with_kmeans(points, centroids, assignements):
    new_assignements = np.array(assignements, copy=True)
    count = -1
    for asg in assignements:
        count += 1
        if asg > -1:
            continue
        pt = points[count, :]
        distances = np.sqrt(((pt - centroids) ** 2).sum(axis=1))
        centroid = np.argmin(distances)
        new_assignements[count] = centroid
    return new_assignements

def move_centroids(points, labels):
    centroids = np.zeros((K, 2), dtype=float)
    for k in range(0, K):
        centroids[k] = points[labels == k].mean(axis=0)  # use the 'labels' argument, not a global
    return centroids

rgba_colors = np.zeros((third_var.size, 4))
rgba_colors[:, 0] = 1.0
rgba_colors[:, 3] = 0.1 + (third_var / max(third_var)) / 1.12
plt.figure(1, figsize=(14, 14))
plt.title("Three blobs", fontsize='small')
plt.scatter(points[:, 0], points[:, 1], marker='o', c=rgba_colors)

# Initialize centroids
centroids = np.random.random((K, 2)) * 10
plt.scatter(centroids[:, 0], centroids[:, 1], marker='X', color='red')

# Step 1 : order points by distance to centroid :
inds, dists = dist2centroids(points, centroids)

# Check if clustering is theoretically possible :
tv_sum = third_var.sum()
tv_max = third_var.max()
if (tv_max > 1 / 3 * tv_sum):
    print("No solution to the clustering problem !\n")
    print("For one point : third variable is too high.")
    sys.exit(0)

stop_criter_eps = 0.001
epsilon = 100000
prev_cumdist = 100000

plt.figure(2, figsize=(14, 14))
ln, = plt.plot([])
plt.ion()
plt.show()

while epsilon > stop_criter_eps:
    # Modified kmeans assignment :
    assignements, cumul_third_var = assign_points_with_constraints(inds, dists, third_var, accept_thresh)
    # Kmeans on remaining points :
    assignements = assign_points_with_kmeans(points, centroids, assignements)
    centroids = move_centroids(points, assignements)
    inds, dists = dist2centroids(points, centroids)
    epsilon = np.abs(prev_cumdist - dists.sum().sum())
    print("Delta on error :", epsilon)
    prev_cumdist = dists.sum().sum()
    plt.clf()
    plt.title("Current Assignements", fontsize='small')
    plt.scatter(points[:, 0], points[:, 1], marker='o', c=assignements)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='o', color='red', linewidths=10)
    plt.text(0, 0, "THRESHOLD T = " + str(accept_thresh), va='top', ha='left', color="red", fontsize='x-large')
    for k in range(0, K):
        plt.text(centroids[k, 0], centroids[k, 1] + 0.7, "Ci = " + str(cumul_third_var[k]))
    plt.show()
    plt.pause(1)
Improvements:
- use the distribution of the third variable for assignments
- manage divergence of the algorithm
- better initialization (kmeans++) - a sketch follows below
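For the last item, a sketch of kmeans++-style seeding that could replace the np.random.random initialization above (the function name is mine; it picks each new centroid with probability proportional to the squared distance from the already chosen ones):

import numpy as np

def kmeanspp_init(points, K, rng=None):
    rng = rng or np.random.default_rng()
    centroids = [points[rng.integers(len(points))]]
    for _ in range(K - 1):
        # squared distance of every point to its nearest chosen centroid
        d2 = ((points[:, None] - np.array(centroids)[None]) ** 2).sum(axis=2).min(axis=1)
        centroids.append(points[rng.choice(len(points), p=d2 / d2.sum())])
    return np.array(centroids)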
One way to handle this would be to filter the data before clustering.
>>> cluster_data = df.loc[df['third_variable'] > some_value]
>>> from sklearn.cluster import KMeans
>>> y_pred = KMeans(n_clusters=2).fit_predict(cluster_data)
If by sum you mean the sum of the third variable per cluster, then you could use RandomizedSearchCV to find hyperparameters of KMeans that do or do not meet the condition.
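Spelled out without RandomizedSearchCV, the same idea is just a search over candidate settings that keeps the ones satisfying the condition. A sketch (feasible_ks and its arguments are my names, with v the per-point third variable and T the threshold):

import numpy as np
from sklearn.cluster import KMeans

def feasible_ks(X, v, T, k_range=range(2, 10)):
    good = []
    for k in k_range:
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(X)
        sums = np.array([v[labels == c].sum() for c in range(k)])
        if (sums > T).all():    # every cluster's sum of v clears the threshold
            good.append(k)
    return good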
K-means itself is an optimization problem.
Your additional constraint is a rather common optimization constraint, too.
So I'd rather approach this with an optimization solver.
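Concretely, with the centroids fixed (say, from one plain k-means pass), the assignment step becomes a small integer program: minimize total squared distance subject to every point getting exactly one cluster and every cluster's sum of V exceeding T. A sketch with PuLP, assuming points and centroids are NumPy arrays (practical only for modest n, since it creates n*K binary variables):

import numpy as np
import pulp

def constrained_assignment(points, centroids, v, T):
    n, K = len(points), len(centroids)
    d = ((points[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)  # squared distances
    prob = pulp.LpProblem("constrained_assignment", pulp.LpMinimize)
    x = pulp.LpVariable.dicts("x", [(i, k) for i in range(n) for k in range(K)], cat="Binary")
    # objective: total within-cluster squared distance
    prob += pulp.lpSum(d[i, k] * x[(i, k)] for i in range(n) for k in range(K))
    for i in range(n):  # each point belongs to exactly one cluster
        prob += pulp.lpSum(x[(i, k)] for k in range(K)) == 1
    for k in range(K):  # constraint: the sum of V per cluster must exceed T
        prob += pulp.lpSum(v[i] * x[(i, k)] for i in range(n)) >= T
    prob.solve()
    return np.array([next(k for k in range(K) if x[(i, k)].value() > 0.5) for i in range(n)])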
