Different clustering results on Azure databricks (cloud) vs Jupyter Notebook (local) with same seed - scikit-learn

I ran sklearn KMeans clustering on the same dataset with the same code and seed on both setups, so why do I still get different clustering results?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import sklearn.cluster as cluster
import sklearn.metrics as metrics
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
df = pd.read_csv("sample_dataset.csv")

# Return only unique records: keep the first row seen for each ID. The explicit
# copy makes df2 an independent frame, so the column assignments below never
# write into a view of df.
df2 = df.drop_duplicates(subset=["ID"], keep="first").copy()

# Remove outliers: anything above Q3 + 1.5 * IQR of the income column.
numeric_cols = ["AMT_INCOME_TOTAL", "DAYS_BIRTH"]
df2[numeric_cols] = df2[numeric_cols].astype("float")
q1 = df2["AMT_INCOME_TOTAL"].quantile(q=0.25)
q3 = df2["AMT_INCOME_TOTAL"].quantile(q=0.75)
iqr = q3 - q1
upperbound = q3 + 1.5 * iqr

# Keep records at or below the upper bound. A boolean mask selects the same
# rows as where() followed by notna() (a NaN income compares False), but it
# does not blank out the other columns first, so their dtypes are preserved.
df2 = df2[df2["AMT_INCOME_TOTAL"] <= upperbound].copy()
df2["AGE"] = (df2["DAYS_BIRTH"] / 365).abs().round()
df3 = df2[["ID", "AMT_INCOME_TOTAL", "AGE"]]

# For simplicity, use 2 columns only: income amount and age.
# Min-max normalisation
scaler = MinMaxScaler()
scale = scaler.fit_transform(df3[["AMT_INCOME_TOTAL", "AGE"]])
df_scale = pd.DataFrame(scale, columns=["AMT_INCOME_TOTAL", "AGE"])
X = df_scale.values

# Best K is 3.
# n_init is pinned explicitly because its default depends on the scikit-learn
# version (10 before 1.4, "auto" from 1.4 on). Two environments with different
# versions can therefore run a different number of initialisations and return
# different clusterings even with the same random_state.
# NOTE(review): also compare sklearn/numpy versions between the two setups;
# pinning n_init only removes this one source of divergence.
k_means_best = KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=101)
y = k_means_best.fit_predict(X)
I tried with two local machines and both produce the same results but when tested on Azure databricks, the results are different.

Related

Annotating clustering from DBSCAN to original Pandas DataFrame

I have working code that is utilizing dbscan to find tight groups of sparse spatial data imported with pd.read_csv.
I am maintaining the original spatial data locations and would like to annotate the labels returned by dbscan for each data point to the original dataframe and then write a csv with the same information.
So the code below is doing exactly what I would expect it to at this point, I would just like to extend it to import the label for each row in the original dataframe.
import argparse
import string
import os, subprocess
import pathlib
import glob
import gzip
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.cluster import DBSCAN
X = pd.read_csv(tmp_csv_name)
# Drop every non-coordinate column in a single call.
X = X.drop(columns=['Name', 'Type', 'SomeValue'])
# only columns 'x' and 'y' now remain

# Fit DBSCAN on the coordinates; fit() returns the fitted estimator itself.
db = DBSCAN(eps=EPS, min_samples=minSamples, metric='euclidean', algorithm='auto', leaf_size=30).fit(X)
# Read the labels from the estimator fitted above (the original referenced an
# undefined name, def_inst_dbsc).
labels = db.labels_
unique_labels = set(labels)

# maxX , maxY are manual inputs temporarily
# Shrink the figure by 20% per step until neither side exceeds 16.
while sizeX > 16 or sizeY > 16:
    sizeX = sizeX * 0.8
    sizeY = sizeY * 0.8

fig, ax = plt.subplots(figsize=(sizeX, sizeY))
plt.xlim(0, maxX)
plt.ylim(0, maxY)
# NOTE(review): `colors` is not defined in this snippet; presumably derived
# from `labels` elsewhere - confirm.
plt.scatter(X['x'], X['y'], c=colors, marker="o", picker=True)

# hackX , hackY are manual inputs temporarily
# which represent the boundaries defined in the original dataset
poly = patches.Polygon(xy=list(zip(hackX, hackY)), fill=False)
ax.add_patch(poly)
plt.show()

clustering for a single timeseries

I have a single NumPy array (x) and I want to cluster it in an unsupervised way using DBSCAN and hierarchical clustering with scikit-learn. Is clustering possible for single-array data? Additionally, I need to plot the clusters and their corresponding representation on the input data.
I tried
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
import scipy.cluster.hierarchy as hac
#my data
x = np.linspace(0, 500, 10000)
x = 1.5 * np.sin(x)
#dbscan
# scikit-learn estimators expect a 2-D (n_samples, n_features) array, so the
# 1-D signal is passed as a single-feature column; fitting the bare 1-D array
# is rejected by input validation.
clustering = DBSCAN(eps=3).fit(x.reshape(-1, 1))
# the line above is where the original 1-D call ran into trouble
# hierarchical
Yes, DBSCAN can cluster "1-D" arrays. See time series below, although I don't know the significance of clustering just the waveform.
For example,
import numpy as np

# Seeded generator so the sample below is reproducible.
rng = np.random.default_rng(42)

# 200 rows drawn around five centres (-10, 0, 0, 0, 10), flattened into a
# single-feature column of 1000 samples.
centres = [-10, 0, 0, 0, 10]
draws = rng.normal(loc=centres, size=(200, 5))
x = draws.reshape(-1, 1)

# Shuffle the rows in place so the groups are interleaved.
rng.shuffle(x)
print(x[:10])
# [[-10.54349551]
# [ -0.32626201]
# [ 0.22359555]
# [ -0.05841124]
# [ -0.11761086]
# [ -1.0824272 ]
# [ 0.43476607]
# [ 11.40382139]
# [ 0.70166365]
# [ 9.79889535]]
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

# Cluster the single-feature column with DBSCAN's default parameters.
dbs = DBSCAN()
clusters = dbs.fit_predict(x)

# Draw every sample on one horizontal line, coloured by its cluster label.
baseline = np.zeros(x.shape[0])
plt.scatter(x, baseline, c=clusters)
You can use AgglomerativeClustering for hierarchical clustering.
Here's an example using the data from above.
from sklearn.cluster import AgglomerativeClustering

# Single-linkage hierarchical clustering, cut at a distance threshold instead
# of asking for a fixed number of clusters.
aggC = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.0,
    linkage="single",
)
clusters = aggC.fit_predict(x)

# Same one-line layout as before, coloured by the new labels.
baseline = np.zeros(x.shape[0])
plt.scatter(x, baseline, c=clusters)
Time Series / Waveform (no other features)
You can do it, but with no features other than time and signal amplitude, I don't know if this has any meaning.
import datetime

import numpy as np
from scipy import signal

# Phase for two periods of each waveform, sampled at 200 points.
phase = 2 * np.pi * np.linspace(0, 2, 200, endpoint=False)
# 100 zero samples placed before, between and after the bursts.
gap = np.zeros(100)

# Square, triangular (sawtooth with width=0.5) and sine bursts, each separated
# by a stretch of zeros.
y = np.hstack((
    gap,
    signal.square(phase),
    gap,
    signal.sawtooth(phase + np.pi / 2, width=0.5),
    gap,
    np.sin(phase),
    gap,
))

start = datetime.datetime.fromisoformat("2022-12-01T12:00:00.000000")
# One timestamp per sample, one microsecond apart. The count follows len(y)
# so the two columns cannot drift out of step if the signal changes.
# `start` is a naive datetime, so timestamp() interprets it in the local time
# zone of the machine running the code.
times = np.array([
    (start + datetime.timedelta(microseconds=offset)).timestamp()
    for offset in range(len(y))
])

# Two-column array: [timestamp, amplitude].
my_sig = np.hstack((times.reshape(-1, 1), y.reshape(-1, 1)))
print(my_sig[:5, :])
# Example output (the first column depends on the local time zone):
# [[1.6698924e+09 0.0000000e+00]
# [1.6698924e+09 0.0000000e+00]
# [1.6698924e+09 0.0000000e+00]
# [1.6698924e+09 0.0000000e+00]
# [1.6698924e+09 0.0000000e+00]]
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering of the (timestamp, amplitude) rows, cut at a
# distance threshold instead of a fixed cluster count.
aggC = AgglomerativeClustering(distance_threshold=4.0, n_clusters=None)
clusters = aggC.fit_predict(my_sig)

# Plot amplitude against time, coloured by cluster label.
time_col = my_sig[:, 0]
amplitude_col = my_sig[:, 1]
plt.scatter(time_col, amplitude_col, c=clusters)

Drop the features that have less correlation with respect to target variable

I have loaded a dataset and tried to find the correlation coefficient with respect to target variable.
Below is the code:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#Loading the dataset
# NOTE(review): load_boston was removed from recent scikit-learn releases;
# this snippet needs a version that still ships it - confirm before running.
x = load_boston()
df = pd.DataFrame(x.data, columns=x.feature_names)
df["MEDV"] = x.target
# Name the axis through the `columns` keyword: pandas 2.x no longer accepts
# the axis as a second positional argument to drop().
X = df.drop(columns="MEDV")  #Feature Matrix
y = df["MEDV"]  #Target Variable
df.head()

#Using Pearson Correlation
plt.figure(figsize=(12, 10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

#Correlation with output variable
# Absolute value, so strong negative correlations count as relevant too.
cor_target = cor["MEDV"].abs()

#Selecting highly correlated features
relevant_features = cor_target[cor_target > 0.4]
print(relevant_features)
How do I drop the features that have correlation coefficient < 0.4?
Try this:
#Selecting least correlated features
# cor_target holds absolute correlations with the target, so this keeps the
# entries whose correlation magnitude is below 0.4.
irelevant_features = cor_target[cor_target < 0.4]
# list of irelevant_features
# The index already holds the column names; no comprehension is needed.
cols = list(irelevant_features.index)
#Dropping irelevant_features
df = df.drop(columns=cols)
# Despite its name, this selection holds the features whose correlation with
# the target is BELOW 0.4, i.e. the ones that get dropped next.
relevant_features = cor_target[cor_target < 0.4]
print(relevant_features)
# Drop the target and the weakly correlated columns. The axis is named via
# `columns` because pandas 2.x rejects a positional axis argument.
X = df.drop(columns=['MEDV', 'CRIM', 'ZN', 'CHAS', 'AGE', 'DIS', 'RAD', 'B'])
Alternatively, build the column list programmatically by iterating over irelevant_features.index, as in the answer above.

sklearn MinMaxScaler mis-scaling?

I'm having trouble understanding one of the scaled columns in a pandas dataframe returned by MinMaxScaler:
The code snippet is as follows:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Random integers in [0, 5) arranged as 8 rows x 4 columns.
A = np.random.randint(5, size=(8, 4))
# Wrap the array in a DataFrame directly. The original created an empty
# DataFrame and immediately rebound the name to the raw ndarray, so the
# DataFrame was never used.
FrameA = pd.DataFrame(A)
# MinMaxScaler rescales each column independently as (x - min) / (max - min),
# so a column whose minimum is not 0 is shifted as well as divided.
scaled_array = MinMaxScaler().fit_transform(FrameA)
Scaled (LHS) and original (RHS)
Column 2 is suspect. The formula seems to be: x[i] / max{x} - 1 which differs from the other columns.

Sklearn kmeans equivalent of elbow method

Let's say I'm examining up to 10 clusters, with scipy I usually generate the 'elbow' plot as follows:
from scipy import cluster

# Run k-means for k = 1..9; each call returns a (codebook, distortion) pair.
cluster_array = [cluster.vq.kmeans(my_matrix, k) for k in range(1, 10)]

# Plot the distortion of every run; the bend in the curve is the 'elbow'.
distortions = [result[1] for result in cluster_array]
pyplot.plot(distortions)
pyplot.show()
I have since become motivated to use sklearn for clustering; however, I'm not sure how to create the array needed to plot as in the scipy case. My best guess was:
# Asker's attempt, kept exactly as posted because the answers below refer to it.
from sklearn.cluster import KMeans
# NOTE: the comprehension is missing the `in` keyword (`for i in range(1,10)`),
# which makes this line a syntax error.
km = [KMeans(n_clusters=i) for i range(1,10)]
# NOTE: `i` is the comprehension variable of the line above; in Python 3 it is
# not visible here, and the list wraps only a single fit() call.
cluster_array = [km[i].fit(my_matrix)]
That unfortunately resulted in an invalid command error. What is the best sklearn way to go about this?
Thank you
You can use the `inertia_` attribute of the KMeans class.
Assuming X is your dataset:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
X = ...  # placeholder: replace with <your_data>

# Candidate cluster counts, shared by the fitting loop and the plot so the two
# cannot get out of step.
ks = range(2, 20)

# Fit one model per k and record its inertia (read from the fitted model's
# inertia_ attribute).
distorsions = []
for k in ks:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(ks, distorsions)
plt.grid(True)
plt.title('Elbow curve')
You had some syntax problems in the code. They should be fixed now:
# Candidate cluster counts and one unfitted model per count.
Ks = range(1, 10)
km = [KMeans(n_clusters=i) for i in Ks]
# fit() returns the estimator itself, so score() can be chained on it.
# Iterate over the models directly rather than indexing with range(len(km)).
score = [model.fit(my_matrix).score(my_matrix) for model in km]
The fit method just returns a self object. In this line in the original code
cluster_array = [km[i].fit(my_matrix)]
the cluster_array would end up having the same contents as km.
You can use the score method to get the estimate for how well the clustering fits. To see the score for each cluster simply run plot(Ks, score).
You can also use the Euclidean distance between each data point and its nearest cluster center to evaluate how many clusters to choose. Here is a code example.
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
iris = load_iris()
x = iris.data

# Average distance from each sample to its nearest cluster center, per k.
res = []
n_cluster = range(2, 20)
for n in n_cluster:
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(x)
    # cdist gives one row per sample and one column per center; the row-wise
    # minimum is the distance to the closest center.
    nearest = np.min(cdist(x, kmeans.cluster_centers_, 'euclidean'), axis=1)
    res.append(np.average(nearest))

plt.plot(n_cluster, res)
plt.title('elbow curve')
plt.show()

Resources