I used KMeans for clustering as shown below, but I don't know how to plot my clusters in a scatter plot.
A plot like this one would work too.
My code is:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
# Toy corpus: four cat-related and four Google-related sentences.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smileyface.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

# Convert the sentences to a TF-IDF matrix, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Cluster the TF-IDF rows into `true_k` groups with a single k-means++ start.
true_k = 2
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
)
model.fit(X)
If I understand your question correctly, I think you might be looking to do something like this? I plotted the data, coloring by label, after converting to cluster distance space.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score

documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smileyface.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]
# Read in your data with pd.read_csv, or wrap an in-memory list like this.
df = pd.DataFrame(documents)

vectorizer = TfidfVectorizer(stop_words='english')
# Column 0 holds the sentences; astype('U') makes sure they are unicode strings.
X = vectorizer.fit_transform(df[0].values.astype('U'))

true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=2000, n_init=20)
# fit_transform returns the data in cluster-distance space:
# one row per document, one column per cluster centre.
Xt = model.fit_transform(X)

# Dense TF-IDF scores and the feature names, ordered by column index.
X = X.toarray()
try:
    fns = np.array(vectorizer.get_feature_names_out())
except AttributeError:
    # Older scikit-learn releases only provide get_feature_names().
    fns = np.array(vectorizer.get_feature_names())

# Label every cluster with the word(s) whose cumulative TF-IDF score is highest.
labels = model.labels_
d = []
for n in sorted(np.unique(labels)):
    t = X[labels == n].sum(axis=0)  # cumulative TF-IDF score per word in cluster n
    words = fns[t == t.max()]
    d.append(",".join(words))

# Transpose so that t[0] / t[1] are the distances to cluster 0 / cluster 1.
t = Xt.T

### plot the clusters
fig, ax = plt.subplots(1, 1)
# Change these to the desired colors; one entry is needed per cluster.
cluster_color_dict = {0: 'purple', 1: 'blue'}
point_colors = [cluster_color_dict[label] for label in labels]
# A single vectorised scatter call replaces one call per point.
ax.scatter(t[0], t[1], c=point_colors, edgecolors='grey', lw=0.5, s=200)

# Empty scatters serve as legend patches, one per cluster.
p1 = []
for i in range(true_k):
    h = ax.scatter([], [], c=cluster_color_dict[i],
                   edgecolors='grey', lw=0.5, s=80, label=d[i])
    p1.append(h)
l1 = ax.legend(handles=p1, title='cluster', bbox_to_anchor=(1, 1), loc='upper left')
Related
So, here is my code:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from fitter import Fitter, get_common_distributions
df = pd.read_csv("project3.csv")
# NOTE(review): these edges are never used -- `bins` is rebound by plt.hist()
# below -- and 15030.33 breaks the otherwise increasing sequence.
bins = [282.33, 594.33, 906.33, 1281.33, 15030.33, 1842.33, 2154.33, 2466.33, 2778.33, 3090.33, 3402.33]

# Colour scheme
facecolor = '#EAEAEA'
color_bars = '#3475D0'
txt_color1 = '#252525'
txt_color2 = '#004C74'

fig, ax = plt.subplots(1, figsize=(16, 6), facecolor=facecolor)
ax.set_facecolor(facecolor)
n, bins, patches = plt.hist(df.City1, color=color_bars, bins=10)

# Minor grid lines in the background colour
plt.gca().xaxis.set_minor_locator(AutoMinorLocator(2))
plt.grid(which='minor', color=facecolor, lw=0.5)

# One tick per bar: placed at the bin centre and labelled with the bin range.
bin_edges = list(zip(bins[:-1], bins[1:]))
xticks = [(upper + lower) / 2 for lower, upper in bin_edges]
xticks_labels = ["{:.0f}-{:.0f}".format(lower, upper) for lower, upper in bin_edges]
plt.xticks(xticks, labels=xticks_labels, c=txt_color1, fontsize=13)

# Strip tick marks, the y axis and all four spines.
ax.tick_params(axis='x', which='both', length=0)
plt.yticks([])
for side in ('bottom', 'left', 'right', 'top'):
    ax.spines[side].set_visible(False)

# Write the count above every non-empty bar.
for tick_position, count in zip(xticks, n):
    if count > 0:
        plt.text(tick_position, count + 5, int(count), ha='center', fontsize=16, c=txt_color1)

plt.title('Histogram of rainfall in City1\n', loc='right', fontsize=20, c=txt_color1)
plt.xlabel('\nCentimeters of rainfall', c=txt_color2, fontsize=14)
plt.ylabel('Frequency of occurrence', c=txt_color2, fontsize=14)
plt.tight_layout()
#plt.savefig('City1_Raw.png', facecolor=facecolor)
plt.show()

# Fit the common distributions and show the four best in a separate figure.
city1 = df['City1'].values
f = Fitter(city1, distributions=get_common_distributions())
f.fit()
fig = f.plot_pdf(names=None, Nbest=4, lw=1, method='sumsquare_error')
plt.show()
best_fit = f.get_best(method='sumsquare_error')
print(best_fit)
The issue is with the plots it shows. The first histogram it generates is
Next I get another graph with best fitted distributions which is
Then an output statement
{'chi2': {'df': 10.692966790090342, 'loc': 16.690849400411103, 'scale': 118.71595997157786}}
Process finished with exit code 0
I have a couple of questions. Why is chi2, the best fitted distribution not plotted on the graph?
How do I plot these distributions on top of the histograms and not separately? The hist() function in fitter library can do that but there I don't get to control the bins and so I end up getting like 100 bins with some flat looking data.
How do I solve this issue? I need to plot the best fit curve on the histogram that looks like image1. Can I use any other module/package to get the work done in similar way? This uses least squares fit but I am OK with least likelihood or log likelihood too.
Simple way of plotting things on top of each other (using some properties of the Fitter class)
import scipy.stats as st
import matplotlib.pyplot as plt
from fitter import Fitter, get_common_distributions
from scipy import stats
# Draw a large normally distributed sample to stand in for real data.
numberofpoints = 50000
df = stats.norm.rvs(loc=1090, scale=500, size=numberofpoints)

fig, ax = plt.subplots(1, figsize=(16, 6))
# density=True normalises the bars so they share a scale with the fitted PDFs.
n, bins, patches = ax.hist(df, bins=30, density=True)

f = Fitter(df, distributions=get_common_distributions())
f.fit()

# Rank the candidates by the error Fitter stored for them (ties broken by
# name) and keep the four best.
ranked_names = sorted(
    get_common_distributions(),
    key=lambda name: (f._fitted_errors[name], name),
)
errorlist = [[f._fitted_errors[name], name] for name in ranked_names[:4]]

# Overlay each selected PDF on the histogram axes.
for _err, dist in errorlist:
    ax.plot(f.x, f.fitted_pdf[dist])
plt.show()
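Note that this relies on the histogram being normalized (density=True). To overlay the curves on a histogram of raw counts instead, the fitted PDFs would have to be rescaled (by the number of points times the bin width).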
I have loaded and plotted a FITS file in python.
With the help of a previous post, I have managed to get the conversion of the axis from pixels to celestial coordinates. But I can't manage to get them in milliarcseconds (mas) correctly.
The code is the following
import numpy as np
import matplotlib.pyplot as plt
import astropy.units as u
from astropy.wcs import WCS
from astropy.io import fits
from astropy.utils.data import get_pkg_data_filename
# Open the FITS file and take its first HDU.
filename = get_pkg_data_filename('hallo.fits')
hdu = fits.open(filename)[0]

# Keep only the celestial axes and move the reference value to the origin,
# so the axes show relative coordinates.
header = hdu.header
wcs = WCS(header).celestial
wcs.wcs.crval = [0, 0]

# Show the first plane of the data cube on WCS-aware axes.
image = hdu.data[0][0]
plt.subplot(projection=wcs)
plt.imshow(image, origin='lower')

# Restrict the view to the same pixel window on both axes.
pixel_window = (200, 800)
plt.xlim(*pixel_window)
plt.ylim(*pixel_window)

plt.xlabel('Relative R.A ()')
plt.ylabel('Relative Dec ()')
plt.colorbar()
The output looks like
The y-label is cut off, and I do not know why.
As it was shown in another post, one could use
wcs.wcs.ctype = [ 'XOFFSET' , 'YOFFSET' ]
to switch it to milliarcsecond, and I get
but the scale is incorrect!.
For instance, 0deg00min00.02sec should be 20 mas and not 0.000002!
Did I miss something here?
Looks like a spectral index map. Nice!
I think the issue might be that FITS implicitly uses degrees for values like CDELT. And they should be converted to mas explicitly for the plot.
The most straightforward way is to multiply CDELT values by 3.6e6 to convert from degrees to mas.
However, there is a more general approach which could be useful if you want to convert to different units at some point:
import astropy.units as u
w.wcs.cdelt = (w.wcs.cdelt * u.deg).to(u.mas)
So it basically says first that the units of CDELT are degrees and then converts them to mas.
The whole workflow is like this:
def make_transform(f):
    '''Build a pixel-to-mas transformation from an already opened FITS object.

    f -- opened FITS file object; the header of its first HDU is used.

    Returns the celestial WCS with its reference value moved to (0, 0), the
    axis types set to plain X/Y offsets, and the increments converted to
    milliarcseconds (the stored CDELT values are treated as degrees).
    '''
    print("Making a transformation out of a FITS header")
    transform = WCS(f[0].header).celestial
    params = transform.wcs

    params.crval = [0, 0]
    params.ctype = ['XOFFSET', 'YOFFSET']
    params.cunit = ['mas', 'mas']

    # Attach degrees to the raw CDELT values first, then convert to mas.
    cdelt_in_degrees = params.cdelt * u.deg
    params.cdelt = cdelt_in_degrees.to(u.mas)

    print(transform.world_axis_units)
    return transform
def read_fits(file):
    '''Read a FITS file into an object.

    file -- path (or file-like object) handed to fits.open().

    Returns the opened FITS object, or None when the file cannot be opened
    or read (missing, unreadable or corrupt file). Only I/O failures are
    turned into None; programming errors and KeyboardInterrupt propagate
    instead of being silently swallowed.
    '''
    try:
        return fits.open(file)
    except OSError as err:
        # Keep the best-effort contract, but say why the read failed.
        print("Could not read FITS file {!r}: {}".format(file, err))
        return None
def start_plot(i, df=None, w=None, xlim=(None, None), ylim=(None, None)):
    '''Start a plot on WCS axes and return (fig, ax).

    i    -- opened FITS object; only used to build the transformation when
            neither df nor w provides one.
    df   -- optional dataframe; when given, the transformation is built from
            it with make_transform_df().
    w    -- optional ready-made transformation to use as is.
    xlim, ylim -- axes limits in mas, as (low, high) pairs.

    Raises ValueError when the transformation has neither 2 nor 4 axes.
    '''
    # Pick the transformation: dataframe first, then the one passed in,
    # and finally one built from the FITS header.
    if df is not None:
        w = make_transform_df(df)
    if w is None:
        w = make_transform(i)
    # print('In start_plot using the following transformation:\n {}'.format(w))

    # Fail early and explicitly instead of hitting an unbound `ax` below.
    if w.naxis not in (2, 4):
        raise ValueError(
            "Cannot plot a transformation with {} axes; expected 2 or 4".format(w.naxis)
        )

    fig = plt.figure()
    if w.naxis == 4:
        # Plot the first two axes and fix the remaining two at index 0.
        ax = plt.subplot(projection=w, slices=('x', 'y', 0, 0))
    else:
        ax = plt.subplot(projection=w)

    # Convert xlim, ylim (mas) to pixel limits through the transformation.
    # NOTE(review): any() treats a limit of 0 like a missing one -- confirm
    # whether limits_mas2pix() should also run for pairs such as (0, 0).
    if any(xlim) and any(ylim):
        xlim_pix, ylim_pix = limits_mas2pix(xlim, ylim, w)
        ax.set_xlim(xlim_pix)
        ax.set_ylim(ylim_pix)

    fig.add_axes(ax)  # note that the axes have to be explicitly added to the figure
    return fig, ax
# `file`, `xlim` and `ylim` are expected to be defined by the caller.
rm = read_fits(file)
wr = make_transform(rm)
# Pass the object that was just read; the name `RM` was never defined.
fig, ax = start_plot(rm, w=wr, xlim=xlim, ylim=ylim)
Then just plot to the axes ax with imshow or contours or whatever.
Of course, this piece of code could be reduced to meet your particular needs.
I am trying to train a kmeans model on the iris dataset in Python.
Is there a way to plot n furthest points from each centroid using kmeans in Python?
Here is a fully working code:
from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np
# import iris dataset
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data[:, 2:4]  # use two variables (columns 2 and 3)

# plot the two variables to check number of clusters
plt.scatter(X[:, 0], X[:, 1])

# kmeans
km = KMeans(n_clusters=2, random_state=0)  # Chose two clusters
y_pred = km.fit_predict(X)
X_dist = km.transform(X)  # distances to each centroid, one column per cluster

# For every cluster, pick the points that lie furthest from their own centroid.
# (The original attempt, np.argsort(X_dist[:3]), only sliced the first three
# rows of X_dist instead of ranking the distances.)
n_furthest = 3
max3IdxArr = []
for label in np.unique(km.labels_):
    X_label_indices = np.where(y_pred == label)[0]
    # Distance of each member of this cluster to this cluster's centroid.
    own_dist = X_dist[X_label_indices, label]
    # argsort is ascending, so reverse it and keep the largest distances.
    furthest_first = np.argsort(own_dist)[::-1][:n_furthest]
    max3Idx = X_label_indices[furthest_first]
    max3IdxArr.append(max3Idx)
print(max3IdxArr)

# plot: X is a NumPy array, so index it directly (there is no .iloc).
furthest = np.concatenate(max3IdxArr)
plt.scatter(X[furthest, 0], X[furthest, 1])
What you did is np.argsort(X_dist[:3]), which takes the first three rows of the unsorted X_dist before any sorting happens.
Instead, do the sorting first with x = np.argsort(X_dist), and only after the sorting is done take x[:3].
Feel free to ask if this isn't working.
Cheers
I have trained my model using Gensim. I drew a 2D plot using PCA, but it is not very clear because the result is so dense. I want to change it to a 3D plot that can be zoomed.
from sklearn.decomposition import PCA
from matplotlib import pyplot
# Fix the word order once, so that row i of `result` always belongs to vocab[i].
vocab = list(model.wv.vocab)
X = model.wv[vocab]

pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])

# most_similar() yields (word, similarity) pairs; keep only the words.
words = [word for word, _similarity in model.wv.most_similar('eden_lake')]

# Annotate each similar word at its own projected position: look up its row
# in the vocabulary instead of using its rank in the similarity list.
row_of = {word: row for row, word in enumerate(vocab)}
for word in words:
    row = row_of[word]
    pyplot.annotate(word, xy=(result[row, 0], result[row, 1]))
pyplot.show()
And the result:
Is it possible to do that?
The following function uses t-SNE instead of PCA for dimension reduction, but will generate a plot in two, three or both two and three dimensions (using subplots). Furthermore, it will color the topics for you so it's easier to distinguish them. Adding %matplotlib notebook to the start of a Jupyter notebook environment from anaconda will allow a 3d plot to be rotated and a 2d plot to be zoomed (don't do both versions at the same time with %matplotlib notebook).
The function is very long, with most of the code being for plot formatting, but produces a professional output.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from gensim.models import LdaModel
from gensim import corpora
from sklearn.manifold import TSNE
# %matplotlib notebook # if in Jupyter for rotating and zooming
def _topic_sort_key(label):
    """Return a sort key that orders 'topic_<n>' labels numerically, other labels last."""
    suffix = label.split('_')[-1]
    if suffix.isdigit():
        return (0, int(suffix), '')
    return (1, 0, label)


def _tsne_frame(topic_weights, n_components, main_topic, topic_colors):
    """Embed the topic weights with t-SNE and return a frame sorted by topic number."""
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`
    # -- confirm against the installed version.
    tsne = TSNE(n_components=n_components, perplexity=40, n_iter=300)
    embedding = tsne.fit_transform(topic_weights)

    frame = pd.DataFrame()
    for axis in range(n_components):
        frame['tsne-{}d-d{}'.format(n_components, axis + 1)] = embedding[:, axis]
    frame['main_topic'] = list(main_topic)
    frame['color'] = [topic_colors[name] for name in frame['main_topic']]
    frame['topic_num'] = [int(name.split('_')[1]) for name in frame['main_topic']]
    return frame.sort_values(['topic_num'], ascending=True).drop('topic_num', axis=1)


def _draw_tsne_2d(ax, df_tsne_2, topic_names, topic_colors, legend_facecolor):
    """Draw the 2D embedding on `ax` with one colour and legend entry per topic."""
    sns.scatterplot(data=df_tsne_2, x="tsne-2d-d1", y="tsne-2d-d2",
                    hue='main_topic', hue_order=topic_names, palette=topic_colors,
                    alpha=0.3, ax=ax)
    ax.set_facecolor(legend_facecolor)
    ax.set_title('t-SNE 2-Dimensional Representation', fontsize=25)
    ax.set_xlabel('tsne-d1', fontsize=20)
    ax.set_ylabel('tsne-d2', fontsize=20)

    # Order the legend by topic number (numerically, so topic_10 follows topic_9).
    handles, labels = ax.get_legend_handles_labels()
    legend_order = sorted(range(len(labels)), key=lambda idx: _topic_sort_key(labels[idx]))
    ax.legend([handles[idx] for idx in legend_order],
              [labels[idx] for idx in legend_order],
              facecolor=legend_facecolor)


def _draw_tsne_3d(ax, df_tsne_3, topic_names, topic_colors, legend_facecolor):
    """Draw the 3D embedding on `ax` and add a legend built from proxy markers."""
    ax.scatter(xs=df_tsne_3['tsne-3d-d1'],
               ys=df_tsne_3['tsne-3d-d2'],
               zs=df_tsne_3['tsne-3d-d3'],
               c=df_tsne_3['color'].tolist(),
               alpha=0.3)
    ax.set_facecolor('white')
    ax.set_title('t-SNE 3-Dimensional Representation', fontsize=25)
    ax.set_xlabel('tsne-d1', fontsize=20)
    ax.set_ylabel('tsne-d2', fontsize=20)
    ax.set_zlabel('tsne-d3', fontsize=20)

    with plt.rc_context({"lines.markeredgewidth": 0}):
        # Blank lines with a coloured marker act as legend handles, one per topic.
        proxy_handles = [Line2D([0], [0], linestyle="none", marker='o', markersize=8,
                                markerfacecolor=topic_colors[name])
                         for name in topic_names]
        ax.legend(proxy_handles, topic_names, loc='upper left', facecolor=legend_facecolor)


def LDA_tSNE_topics_vis(dimension='both',
                        corpus=None,
                        num_topics=10,
                        remove_3d_outliers=False,
                        save_png=False):
    """
    Plots the outputs of an LDA model using t-SNE (t-distributed Stochastic Neighbor Embedding)

    Note: t-SNE reduces the dimensionality of a space such that similar points will be closer and dissimilar points farther

    Parameters
    ----------
        dimension : str (default=both)
            The dimension that t-SNE should reduce the data to for visualization
            Options: 2d, 3d, and both (a plot with two subplots)

        corpus : list, list of lists
            The tokenized and cleaned text corpus over which analysis should be done

        num_topics : int (default=10)
            The number of categories for LDA based approaches

        remove_3d_outliers : bool (default=False)
            Whether to remove outliers from a 3d plot

        save_png : bool (default=False)
            Whether to save the figure as a png

    Returns
    -------
        None. The t-SNE lower dimensional representation of the LDA model's topics
        and their constituent members is shown with plt.show() (and saved as a
        timestamped png first when save_png is set)

    Raises
    ------
        ValueError
            If dimension is not one of 2d, 3d or both
    """
    import time  # only needed for the timestamp in the saved file name

    # Validate before the expensive model fitting, and actually raise the error.
    if dimension not in ('2d', '3d', 'both'):
        raise ValueError("An invalid value has been passed to the 'dimension' argument - choose from 2d, 3d, or both.")
    want_2d = dimension in ('2d', 'both')
    want_3d = dimension in ('3d', 'both')

    dirichlet_dict = corpora.Dictionary(corpus)
    bow_corpus = [dirichlet_dict.doc2bow(text) for text in corpus]

    dirichlet_model = LdaModel(corpus=bow_corpus,
                               id2word=dirichlet_dict,
                               num_topics=num_topics,
                               update_every=1,
                               chunksize=len(bow_corpus),
                               passes=10,
                               alpha='auto',
                               random_state=42)  # set for testing

    # One row per document, one column per topic, filled with the topic weights.
    # Building the array up front avoids growing a DataFrame row by row.
    topic_names = ['topic_{}'.format(i) for i in range(num_topics)]
    topic_weights = np.zeros((len(bow_corpus), num_topics), dtype='float64')
    for row, bow in enumerate(bow_corpus):
        for topic_num, weight in dirichlet_model.get_document_topics(bow, minimum_probability=0):
            topic_weights[row, topic_num] = weight

    df_topic_coherences = pd.DataFrame(topic_weights, columns=topic_names)
    main_topic = df_topic_coherences.idxmax(axis=1)

    # cubehelix is better for more than 10 colors; otherwise use seaborn's 'deep'.
    palette_name = "cubehelix" if num_topics > 10 else 'deep'
    colors = sns.color_palette(palette_name, num_topics)
    # The same topic -> colour mapping is used by both plots so they agree.
    topic_colors = dict(zip(topic_names, colors))

    light_grey_tup = (242/256, 242/256, 242/256)

    df_tsne_2 = None
    df_tsne_3 = None
    if want_2d:
        df_tsne_2 = _tsne_frame(topic_weights, 2, main_topic, topic_colors)
    if want_3d:
        df_tsne_3 = _tsne_frame(topic_weights, 3, main_topic, topic_colors)
        if remove_3d_outliers:
            # Remove those rows with values that are more than three standard deviations from the column mean
            for col in ['tsne-3d-d1', 'tsne-3d-d2', 'tsne-3d-d3']:
                within_range = np.abs(df_tsne_3[col] - df_tsne_3[col].mean()) <= (3 * df_tsne_3[col].std())
                df_tsne_3 = df_tsne_3[within_range]

    if want_2d and want_3d:
        # 3D plot on the left, 2D plot on the right.
        fig = plt.figure(figsize=(20, 10))
        ax_3d = fig.add_subplot(121, projection='3d')
        ax_2d = fig.add_subplot(122)
        _draw_tsne_2d(ax_2d, df_tsne_2, topic_names, topic_colors, light_grey_tup)
        _draw_tsne_3d(ax_3d, df_tsne_3, topic_names, topic_colors, light_grey_tup)
    elif want_2d:
        fig, ax_2d = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))
        _draw_tsne_2d(ax_2d, df_tsne_2, topic_names, topic_colors, light_grey_tup)
    else:
        fig = plt.figure(figsize=(20, 10))
        ax_3d = fig.add_subplot(111, projection='3d')
        _draw_tsne_3d(ax_3d, df_tsne_3, topic_names, topic_colors, light_grey_tup)

    if save_png:
        plt.savefig('LDA_tSNE_{}.png'.format(time.strftime("%Y%m%d-%H%M%S")), bbox_inches='tight', dpi=500)

    plt.show()
An example plot for both 2d and 3d (with outliers removed) representations of a 10 topic gensim LDA model on subplots would be:
Yes, in principle it is possible to do 3D visualization of LDA model results. Here is more information about using T-SNE for that.
I am starting with Spark Linear Regression. I am trying to fit a line to a linear dataset. It seems that the intercept is not correctly adjusting, or probably I am missing something..
With intercept=False:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=False)
This seems normal. But when I use intercept=True:
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=100, step=0.0001, intercept=True)
The model that I get in the last case is exactly:
(weights=[0.0353471289751], intercept=1.0005127185289888)
I have tried different datasets, step sizes and numbers of iterations, but whenever the model converges, the intercept is always about 1.
EDIT - This is the code I am using:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
sc = SparkContext("local", "regression")

# Generate data: a noisy straight line y = BASE + SLOPE*x
SIZE = 300
SLOPE = 0.1
BASE = -30
NOISE = 10
x = np.arange(SIZE)
delta = np.random.uniform(-NOISE, NOISE, size=(SIZE,))
y = BASE + SLOPE*x + delta

# (index, value) pairs; list() makes it a concrete sequence on Python 3 too,
# where zip() returns a one-shot iterator.
data = list(zip(range(len(y)), y))
dataRDD = sc.parallelize(data)

# Normalize data
# mean = np.mean(data)
# std = np.std(data)
# dataRDD = dataRDD.map(lambda r: (r[0], (float(r[1])-mean)/std))

# The value is the label, the index is the single feature.
labeledData = dataRDD.map(lambda r: LabeledPoint(float(r[1]), [float(r[0])]))

# Create linear model
linear_model = LinearRegressionWithSGD.train(labeledData, iterations=1000, step=0.0002, intercept=True, convergenceTol=0.000001)
print(linear_model)  # function-call form works on both Python 2 and Python 3

true_vs_predicted = labeledData.map(lambda p: (p.label, linear_model.predict(p.features))).collect()

# PLOT
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid()
# Distinct loop names keep the comprehensions from clobbering the array `x`
# (comprehension variables leak into the enclosing scope on Python 2).
y_real = [label for label, _prediction in true_vs_predicted]
y_pred = [prediction for _label, prediction in true_vs_predicted]
plt.plot(range(len(y_real)), y_real, 'o', markersize=5, c='b')
plt.plot(range(len(y_pred)), y_pred, 'o', markersize=5, c='r')
plt.show()
This is because the number of iterations and the step size are both too small. As a result, the training process ends before reaching the local optimum.