Local Outlier Factor only calculated for some points (scikitLearn) - python-3.x

I have a large csv file, containing 2 columns representing the result of k-means clustering. I calculated 11 centroids, and the csv-file contains which one is the closest and which distance the point has towards this centroid.
The entries look like:
K11-closest,K11-distance
0,31544.821603570384
0,31494.23348984612
0,31766.471900874752
0,31710.896696452823
Then I want to calculate and plot the LOF using a script I found on scikit-learn.org
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
dataset = pd.read_csv('0.csv')
clf = LocalOutlierFactor(n_neighbors=20)
# use fit_predict to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no predict,
# decision_function and score_samples methods).
y_pred = clf.fit_predict(dataset)
X_scores = clf.negative_outlier_factor_
plt.title("Local Outlier Factor (LOF)")
plt.scatter(dataset.iloc[:, 0], dataset.iloc[:, 1], color='k', s=3., label='Data points')
# plot circles with radius proportional to the outlier scores
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(dataset.iloc[:, 0].values, dataset.iloc[:, 1].values, s=50 * radius, edgecolors='r',
facecolors='none', label='Outlier scores')
plt.show()
But the plot shows:
With black points being the date points and red is a circle, showing how much it is an outlier
So I assume the LOF is not calculated for every point. But why? And how I calculate it for every point? And make it visible in the plot

normalising the data will help you in making more visible graphs and as per your code you have taken multipier of radius as 50 and I have taken 1000.
As we can see the algorithm does not mark red circle for every data point and it also depends on nearest neighbours(n_neighbors) we are taking into account fro algo to mark the circles.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dataset = pd.DataFrame(data=[[0, 31544.821603570384], [0,31494.23348984612], \
[0,31766.471900874752], [0,31710.896696452823]], \
columns=["K11-closest","K11-distance"])
dataset = scaler.fit_transform(dataset)
clf = LocalOutlierFactor(n_neighbors=3)
y_pred = clf.fit_predict(dataset)
X_scores = clf.negative_outlier_factor_
plt.title("Local Outlier Factor (LOF)")
plt.scatter(dataset[:, 0], dataset[:, 1], color='k', s=3., label='Data points')
# plot circles with radius proportional to the outlier scores
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(dataset[:, 0], dataset[:, 1], s=1000 * radius, edgecolors='r',
facecolors='none', label='Outlier scores')
legend = plt.legend(loc='upper left')
legend.legendHandles[0]._sizes = [10]
legend.legendHandles[1]._sizes = [20]
plt.show()

Related

Can Principal Component Analysis be applied on 2D geometry with X and Y nodes in Python?

Aim of the task: I have sets of coordinates (X and Y) coordinates of the geometry and I want to make my geometry aligned. The coordinate and respective geometry is shown in the picture.
X1_coordinate = [0.0, 0.87, 1.37, 1.87, 2.73, 3.6, 4.46, 4.96, 5.46, 4.6, 3.73, 2.87, 2.0, 1.5, 1.0, 0.5, 2.37, 3.23, 4.1]
Y1_coordinate = [0.0, 0.5, -0.37, -1.23, -0.73, -0.23, 0.27, -0.6, -1.46, -1.96, -2.46, -2.96, -3.46, -2.6, -1.73, -0.87, -2.1, -1.6, -1.1]
Question: Can I apply Principal Component Analysis on 2D geometries to make it aligned such that its principal axis will be parallel to the reference axis (X and Y)?
Expected output: I want my geometry like this. This is just an example. I want my geometry in such as way that, principal axis of geometry lies on the reference axis or be parallel to reference axis.
What I tried: I tried below code to implement PCA and to obtain the geometry aligned.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
plt.style.use('ggplot')
# Load the data
# iris = datasets.load_iris()
X = X1_coordinate
y = Y1_coordinate
# Z-score the features
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
# The PCA model
pca = PCA(n_components=2) # estimate only 2 PCs
X_new = pca.fit_transform(X) # project the original data into the PCA space
However, after running the code, I got error as mentioned below.
Kindly let me know what should I do to make my geometry aligned. Looking forward to get answers.
Basically, you can apply PCA to this task.
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
circle_pts = sklearn.datasets.make_circles() #get two circles with labels
circle_pts = circle_pts[0][circle_pts[1]==0] #leave only one circle
ang = 63/180*np.pi #radians of rotation
R = np.array([[np.cos(ang), -np.sin(ang)], [np.sin(ang), np.cos(ang)]])
ellipse_pts = circle_pts
ellipse_pts[:,0] *= 4.5
ellipse_rot_pts = ellipse_pts # R.T
plt.figure()
plt.scatter(ellipse_rot_pts[:,0], ellipse_rot_pts[:,1])
plt.axis("equal")
plt.tight_layout()
plt.show()
scaler = StandardScaler(with_std=False)
scaler.fit(ellipse_rot_pts)
X = scaler.transform(ellipse_rot_pts)
pca = PCA(n_components=2) # estimate only 2 PCs
X_new = pca.fit_transform(X) # project the original data into the PCA space
plt.figure()
plt.scatter(X[:,0],X[:,1])
singular_values = pca.singular_values_
plt.plot([0, singular_values[0]*pca.components_[0,0]], [0, singular_values[0]*pca.components_[0,1]])
plt.plot([0, singular_values[1]*pca.components_[1,0]], [0, singular_values[1]*pca.components_[1,1]])
plt.axis("equal")
plt.show()
plt.figure()
plt.title("Aligned with axis figure")
plt.scatter(X_new[:,0],X_new[:,1])
plt.axis("equal")
plt.show()
But the problem is that not every geometry is appropriate for this. ellipse has two main axis of symmetry. Your figure for example doesn't. So principal components that are been seeking via maximum variance in data doesn't correspond with your example(expected output) axis alignement.
For example your set of points correspond to this variant of components alignment:
Your geometry
And for a modificated little more symmetric object.
A little more symmetrical figure
Hope i helped

How reduce the scale of a scatter plot with row coordinates to merge it with a circle of correlations to make a bibplot?

I have a dataset composed of data with the same unit of measurement. Before making my pca, I centered my data using sklearn.preprocessing.StandardScaler(with_std=False).
I don't understand why but using the sklearn.decomposition.PCA.fit_transform(<my_dataframe>) method when I want to display a correlation circle I get two perfectly represented orthogonal variables, thus indicating that they are independent, but they are not. With a correlation matrix I observe perfectly that they are anti-correlated.
Through dint of research I came across the "prince" package which manages to get the perfect coordinates of my centered but unscaled variables.
When I do my pca with it, I can perfectly display the projection of my lines. It also has the advantage of being able to display ellipses. The only problem is that there is no function for a bibplot.
I managed to display a circle of correlations using the column_correlations() method to get the coordinates of the variables. By tinkering here is what I managed to get:
When I try to put my two graphs together to form a biplot, my scatter plot is displayed in a scale that is way too large compared to the correlation circle.
I would just like to merge the two charts together using this package.
Here is the code that allowed me to get the graph showing row principal coordinates:
Note: In order to propose a model to reproduce I use the iris dataset, resembling in form to my dataset.
import pandas as pd
import prince
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(url, names=names)
dataset = dataset.set_index('Class')
sc = StandardScaler(with_std=False)
dataset = pd.DataFrame(sc.fit_transform(dataset),
index=dataset.index,
columns=dataset.columns)
prince_pca = prince.PCA(n_components=2,
n_iter=3,
rescale_with_mean=True,
rescale_with_std=False,
copy=True,
check_input=True,
engine='auto',
random_state=42)
prince_pca = prince_pca.fit(dataset)
ax = prince_pca.plot_row_coordinates(dataset,
ax=None,
figsize=(10, 10),
x_component=0,
y_component=1,
labels=None,
color_labels=dataset.index,
ellipse_outline=True,
ellipse_fill=True,
show_points=True)
plt.show()
Here's the one I tinkered with to get my circle of correlations:
pcs = prince_pca.column_correlations(dataset)
pcs_0=pcs[0].to_numpy()
pcs_1=pcs[1].to_numpy()
pcs_coord = np.concatenate((pcs_0, pcs_1))
fig = plt.subplots(figsize=(10,10))
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.quiver(np.zeros(pcs_0.shape[0]), np.zeros(pcs_1.shape[0]),
pcs_coord[:4], pcs_coord[4:], angles='xy', scale_units='xy', scale=1, color='r', width= 0.003)
for i, (x, y) in enumerate(zip(pcs_coord[:4], pcs_coord[4:])):
plt.text(x, y, pcs.index[i], fontsize=12)
circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
plt.gca().add_artist(circle)
plt.plot([-1,1],[0,0],color='silver',linestyle='--',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='--',linewidth=1)
plt.title("Correlation circle of variable", fontsize=22)
plt.xlabel('F{} ({}%)'.format(1, round(100*prince_pca.explained_inertia_[0],1)),
fontsize=14)
plt.ylabel('F{} ({}%)'.format(2, round(100*prince_pca.explained_inertia_[1],1)),
fontsize=14)
plt.show()
And finally here is the one that tries to bring together the circle of correlations as well as the main row coordinates graph from the "prince" package:
pcs = prince_pca.column_correlations(dataset)
pcs_0 = pcs[0].to_numpy()
pcs_1 = pcs[1].to_numpy()
pcs_coord = np.concatenate((pcs_0, pcs_1))
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, aspect="equal")
plt.xlim(-1, 1)
plt.ylim(-1, 1)
plt.quiver(np.zeros(pcs_0.shape[0]),
np.zeros(pcs_1.shape[0]),
pcs_coord[:4],
pcs_coord[4:],
angles='xy',
scale_units='xy',
scale=1,
color='r',
width=0.003)
for i, (x, y) in enumerate(zip(pcs_coord[:4], pcs_coord[4:])):
plt.text(x, y, pcs.index[i], fontsize=12)
plt.scatter(
x=prince_pca.row_coordinates(dataset)[0],
y=prince_pca.row_coordinates(dataset)[1])
circle = plt.Circle((0, 0), 1, facecolor='none', edgecolor='b')
plt.gca().add_artist(circle)
plt.plot([-1, 1], [0, 0], color='silver', linestyle='--', linewidth=1)
plt.plot([0, 0], [-1, 1], color='silver', linestyle='--', linewidth=1)
plt.title("Correlation circle of variable", fontsize=22)
plt.xlabel('F{} ({}%)'.format(1,
round(100 * prince_pca.explained_inertia_[0],
1)),
fontsize=14)
plt.ylabel('F{} ({}%)'.format(2,
round(100 * prince_pca.explained_inertia_[1],
1)),
fontsize=14)
plt.show()
Bonus question: how to explain that the PCA class of sklearn does not calculate the correct coordinates for my variables when they are centered but not scaled? Any method to overcome this?
Here is the circle of correlations obtained by creating the pca object with sklearn where the "length" and "margin_low" variables appear as orthogonal:
Here is the correlation matrix demonstrating the negative correlation between the "length" and "margin_low" variables:
I managed to mix the two graphs.
Here is the code to display the graph combining the circle of correlations and the scatter with the rows:
import pandas as pd
import prince
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
# Import dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Preparing the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv(url, names=names)
dataset = dataset.set_index('Class')
# Preprocessing: centered but not scaled
sc = StandardScaler(with_std=False)
dataset = pd.DataFrame(sc.fit_transform(dataset),
index=dataset.index,
columns=dataset.columns)
# PCA setting
prince_pca = prince.PCA(n_components=2,
n_iter=3,
rescale_with_mean=True,
rescale_with_std=False,
copy=True,
check_input=True,
engine='auto',
random_state=42)
# PCA fiting
prince_pca = prince_pca.fit(dataset)
# Component coordinates
pcs = prince_pca.column_correlations(dataset)
# Row coordinates
pca_row_coord = prince_pca.row_coordinates(dataset).to_numpy()
# Preparing the colors for parameter 'c'
colors = dataset.T
# Display row coordinates
ax = prince_pca.plot_row_coordinates(dataset,
figsize=(12, 12),
x_component=0,
y_component=1,
labels=None,
color_labels=dataset.index,
ellipse_outline=True,
ellipse_fill=True,
show_points=True)
# We plot the vectors
plt.quiver(np.zeros(pcs.to_numpy().shape[0]),
np.zeros(pcs.to_numpy().shape[0]),
pcs[0],
pcs[1],
angles='xy',
scale_units='xy',
scale=1,
color='r',
width=0.003)
# Display the names of the variables
for i, (x, y) in enumerate(zip(pcs[0], pcs[1])):
if x >= xmin and x <= xmax and y >= ymin and y <= ymax:
plt.text(x,
y,
prince_pca.column_correlations(dataset).index[i],
fontsize=16,
ha="center",
va="bottom",
color="red")
# Display a circle
circle = plt.Circle((0, 0),
1,
facecolor='none',
edgecolor='orange',
linewidth=1)
plt.gca().add_artist(circle)
# Title
plt.title("Row principal coordinates and circle of correlations", fontsize=22)
# Display the percentage of inertia on each axis
plt.xlabel('F{} ({}%)'.format(1,
round(100 * prince_pca.explained_inertia_[0],
1)),
fontsize=14)
plt.ylabel('F{} ({}%)'.format(2,
round(100 * prince_pca.explained_inertia_[1],
1)),
fontsize=14)
# Display the grid to better read the values ​​of the circle of correlations
plt.grid(visible=True)
plt.show()

Plotting new points in a subspace after dimensionality reduction

I would like to plot points with 100 parameters each with values between 0-99 on a 2 dimensional plot. This should be straightforward with normal methods of dimensionality reduction (PCA/tSNE/UMAP etc) but I need to be able to add subsequent points to the plot without it needing to recalculate and therefore change
I am picturing an algorithm that takes a data-point with it's 100 values and converts it to X,Y coordinates that can then be plotted. Points proximal in the 2D projection are proximal in the original 100D space. Does such an algorithm exist? If not, any alternative approaches?
Thanks
I am not sure I understood the question correctly but with an initial set X, we can fit a PCA to compute the principal components. Then, we can use these principal components to transform new samples.
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
n_samples, n_feats = 50, 100
X = np.random.randint(0, 99, size=n_samples * n_feats).reshape(n_samples, n_feats)
pca = PCA(n_components=2).fit(X)
X_reduced = pca.transform(X)
plt.scatter(X[:, 0], X[:, 1])
This plots,
Then, when a new sample comes in
new_sample = np.random.randint(0, 99, size=100).reshape(1, 100)
new_sample_reduced = pca.transform(new_sample)
plt.scatter(new_sample_reduced[:, 0], new_sample_reduced[:, 1], color="red")
We can plot it

How To Plot n Furthest Points From Each Centroid KMeans

I am trying to train a kmeans model on the iris dataset in Python.
Is there a way to plot n furthest points from each centroid using kmeans in Python?
Here is a fully working code:
from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
# import iris dataset
iris = datasets.load_iris()
X = iris.data[:, 2:5] # use two variables
# plot the two variables to check number of clusters
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1])
# kmeans
km = KMeans(n_clusters = 2, random_state = 0) # Chose two clusters
y_pred = km.fit_predict(X)
X_dist = kmeans.transform(X) # get distances to each centroid
## Stuck at this point: How to make a function that extracts three points that are furthest from the two centroids
max3IdxArr = []
for label in np.unique(km.labels_):
X_label_indices = np.where(y_pred == label)[0]
# max3Idx = X_label_indices[np.argsort(X_dist[:3])] # This part is wrong
max3Idx = X_label_indices[np.argsort(X_dist[:3])] # This part is wrong
max3IdxArr.append(max3Idx)
max3IdxArr
# plot
plt.scatter(X[:, 0].iloc[max3IdxArr], X[:, 1].iloc[max3IdxArr])
what you did is np.argsort(X_dist[:3])
which already takes top three values from the unsorted X_dist hence you can
try taking x=np.argsort(x_dist) and
after sorting is done you could then try
x[:3]
feel free to ask,
if this isnt working
cheers

How to plot the output of k-means clustering of word embedding using python?

I have used gensims word embeddings to find vectors of each word. Then I used K-means to find clusters of word. There are close to 10,000 tokens/words and I want to plot them.
I want to plot the result in the following way:
Annotate points with name of words
Different color for clusters
Here is what I have done.
tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=500)#, random_state=13)
def tsne_plot(data):
"Creates and TSNE model and plots it"
data=data.sample(n = 500).reset_index()
word=data["word"]
cluster=data["clusters"]
data=data.drop(["clusters","word"],axis=1)
X = tsne.fit_transform(data)
plt.figure(figsize=(48, 48))
for i in range(len(X)):
plt.scatter(X[:,0][i],X[:,1][i],c=cluster[i])
plt.annotate(word[i],
xy=(X[:,0][i],X[:,1][i]),
xytext=(3, 2),
textcoords='offset points',
ha='right',
va='bottom')
plt.show()
tsne_plot(data)
Though it's annotating the words but failing to color different groups/clusters?
Anyother other approach which annoates with word anmes and colors different clusters?
This is how it's typically done; with annotations and rainbow colors.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
X = np.array([[5,3],
[10,15],
[15,12],
[24,10],
[30,45],
[85,70],
[71,80],
[60,78],
[55,52],
[80,91],])
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
print(kmeans.cluster_centers_)
print(kmeans.labels_)
#plt.scatter(X[:,0],X[:,1], c=kmeans.labels_, cmap='rainbow')
data = X
labels = kmeans.labels_
#######################################################################
plt.subplots_adjust(bottom = 0.1)
plt.scatter(data[:, 0], data[:, 1], c=kmeans.labels_, cmap='rainbow')
for label, x, y in zip(labels, data[:, 0], data[:, 1]):
plt.annotate(
label,
xy=(x, y), xytext=(-20, 20),
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc='red', alpha=0.5),
arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))
plt.show()
#######################################################################
See the link below for all details.
https://stackabuse.com/k-means-clustering-with-scikit-learn/
See the link below for some samples of how to do annotations with characters, rather tan numbers.
https://nikkimarinsek.com/blog/7-ways-to-label-a-cluster-plot-python

Resources