Extract top words for each cluster - python-3.x

I have done K-means clustering for text data
# K-means clustering of the tf-idf feature matrix
import time

from sklearn.cluster import KMeans

num_clusters = 4
# A fixed seed keeps the cluster ids stable between runs, so labels stored on
# the dataframe stay comparable across executions.
km = KMeans(n_clusters=num_clusters, random_state=42)
# `%time` is an IPython magic and a SyntaxError outside a notebook; time the
# fit explicitly instead.
_fit_start = time.perf_counter()
km.fit(features)
print("Wall time: {:.2f} s".format(time.perf_counter() - _fit_start))
# One cluster id per row of `features`, in row order.
clusters = km.labels_.tolist()
where features is the tf-idf vector
# Preprocessing: convert the text column into a dense tf-idf matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=0.01,
    max_df=0.75,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
# fit_transform returns a sparse matrix; it is densified here.
features = tfidf.fit_transform(df['keywrds']).toarray()
labels = df['CD']
Then I added the cluster label to original dataset
# Store each row's cluster id next to the original data.
df['clusters'] = clusters
And indexed the dataframe by clusters
# Passing an existing DataFrame together with `index=` to the constructor
# re-indexes (selects rows by label) rather than relabelling them, and the
# result was never stored. Set the cluster id as the index explicitly; the
# column is kept so it can still be used for groupby/filtering.
df_by_cluster = df.set_index('clusters', drop=False).sort_index()
How do I fetch the top words for each cluster?

This does not really give the top words of each cluster; instead it orders the words of each row by how frequent they are. You can then just use the first word(s) as a word-group label instead of a cluster number.
built a dict with all feature names and tfidf score
from collections import defaultdict

# Group (term, idf) pairs by the number of words in the term.
# `featurenames` has to exist before it is appended to.
featurenames = defaultdict(list)
# get_feature_names() was removed in newer scikit-learn releases in favour of
# get_feature_names_out(); support both.
_get_names = getattr(tfidf, 'get_feature_names_out', None)
if _get_names is None:
    _get_names = tfidf.get_feature_names
for f, w in zip(_get_names(), tfidf.idf_):
    featurenames[len(f.split(' '))].append((f, w))
# Keep single-word terms only, as a {term: idf} dict.
featurenames = dict(featurenames[1])
rounded off feature idf values cause they were a little long
# Round every idf value to 4 decimals to keep the generated tokens short.
featurenames = {name: round(idf, 4) for name, idf in featurenames.items()}
converted dict to df
# Two-column frame: the term ('featurename') and its idf value ('featureid').
dffeatures = pd.DataFrame.from_dict(featurenames, orient='index')
dffeatures = dffeatures.reset_index()
dffeatures = dffeatures.rename(columns={'index': 'featurename', 0: 'featureid'})
dffeatures = dffeatures.round(4)
combined feature word with id and created a new dictionary. I did this to accommodate for duplicate id's.
def _idf_token(row):
    """Build the 'idf:term' token for one row of dffeatures."""
    return str(row['featureid']) + ':' + str(row['featurename'])


dffeatures['combined'] = dffeatures.apply(_idf_token, axis=1)
# Lookup table: term -> 'idf:term'.
featurenamesnew = dict(zip(dffeatures['featurename'], dffeatures['combined']))
{'cat': '2.3863:cat', 'cow': '3.0794:cow', 'dog': '2.674:dog'....}
created a new col in the df and replaced all word with idf:feature value
import re

# A dict passed to Series.replace(regex=True) is applied pattern by pattern
# and matches substrings, so 'cat' would also rewrite part of 'category' and
# later patterns could hit text inserted by earlier ones. Use one
# word-bounded alternation and replace every match in a single pass.
if featurenamesnew:
    _feature_re = re.compile(
        r'\b(?:%s)\b' % '|'.join(
            re.escape(name)
            # Longest names first so the alternation prefers the longest term.
            for name in sorted(featurenamesnew, key=len, reverse=True)
        )
    )
    df['temp'] = df['inputdata'].str.replace(
        _feature_re, lambda m: featurenamesnew[m.group(0)], regex=True
    )
else:
    # Nothing to map: keep the text unchanged.
    df['temp'] = df['inputdata']
ordered the df idf:feature value ascending so most frequent words appear first
def _idf_sort_key(token):
    """Sort key for 'idf:word' tokens: numeric idf first, unmapped words last.

    Plain string sorting puts '10.2:x' before '2.3:y'; comparing the idf
    prefix as a float keeps the most frequent (lowest idf) words first.
    """
    prefix, sep, _ = token.partition(':')
    if sep:
        try:
            return (0, float(prefix), token)
        except ValueError:
            pass
    # Words that were not mapped to an idf value sort after all mapped ones.
    return (1, 0.0, token)


# De-duplicate the tokens of every row and order them by ascending idf.
df['temp'] = (
    df['temp']
    .str.split()
    .apply(lambda tokens: sorted(set(tokens), key=_idf_sort_key))
    .str.join(' ')
)
reverse-map the idf:feature values back to the words
# Invert the lookup table: 'idf:term' -> term.
inv_map = dict(zip(featurenamesnew.values(), featurenamesnew.keys()))
# The keys are applied as regular expressions to every row of df['temp'].
df['cluster_top_n_words'] = df['temp'].replace(inv_map, regex=True)
finally keep top n words in the new df col
# Number of leading words kept per row (previously a hard-coded 3).
top_n = 3
df['cluster_top_n_words'] = df['cluster_top_n_words'].apply(
    lambda words: ' '.join(words.split()[:top_n])
)

Related

Extraction of N most frequent keywords per cluster in Hierarchical Clustering NLP

I want to extract the n most frequent keywords per cluster from the results of agglomerative hierarchical clustering.
def agglomerative_clustering(tfidf_matrix, n_clusters=95):
    """Cluster the rows of ``tfidf_matrix`` with Ward-linkage agglomeration.

    Args:
        tfidf_matrix: dense array-like with one row per document.
        n_clusters: number of clusters to form (default 95, the value that
            used to be hard-coded).

    Returns:
        Array with one cluster label per row of ``tfidf_matrix``.
    """
    # `affinity` was deprecated and later removed from scikit-learn; Ward
    # linkage only supports the (default) euclidean metric, so it is omitted.
    cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    labels = cluster.fit_predict(tfidf_matrix)
    print(cluster.n_clusters_)
    print("lables is "+str(labels.shape))
    print("test"+str(labels))
    return labels
def tfidf(data):
    """Fit a TfidfVectorizer on ``data``.

    Args:
        data: iterable of raw text documents.

    Returns:
        Tuple ``(vectors, feature_names)``: the sparse tf-idf matrix and the
        list of terms, one per matrix column.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data)
    # get_feature_names() was removed in newer scikit-learn releases.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = list(get_names())
    # The dense DataFrame that used to be built here was never used and only
    # cost memory, so it is no longer created.
    return vectors, feature_names
vectors, terms = tfidf(cleaned_documents)
labels = agglomerative_clustering(vectors.toarray())
lib['cleaned_documents'] = pd.Series(cleaned_documents)
lib['clusterAgglomerative'] = pd.Series(labels)
# One row per document, one column per term. `vectorized_data` was never
# defined; the tf-idf matrix is `vectors`. Naming the columns after the terms
# makes the per-cluster sums readable.
X = pd.DataFrame(vectors.toarray(), index=lib['cleaned_documents'], columns=terms)
# Add column corresponding to cluster number
X['Cluster'] = labels
# Summed tf-idf weight of every term within each cluster
word_frequencies_by_cluster = X.groupby('Cluster').sum()
# To get sorted list for a numbered cluster, in this case 2
print("Top terms per cluster:")
print(word_frequencies_by_cluster.loc[2, :].sort_values(ascending=False))
The result I want is, for each cluster, its N most frequent keywords.
I tried the following solution, but it does not seem to be efficient:
# Build the frame from named columns. Passing the cluster labels as the second
# positional argument makes them the *index*, which re-indexes the documents
# by label value instead of pairing each document with its cluster.
df_lib = pd.DataFrame({
    'cleaned_documents': lib['cleaned_documents'],
    'clusterAgglomerative': lib['clusterAgglomerative'],
})
print(df_lib)
grouped_df = df_lib.groupby("clusterAgglomerative")
# sorted() makes the joined text deterministic; set order is arbitrary.
grouped_lists = grouped_df["cleaned_documents"].agg(
    lambda column: ", ".join(sorted(set(column)))
)
print("keywords per cluster")
print(grouped_lists)

Sort similarity matrix according to plot colors

I have this similarity matrix plot of some documents. I want to sort the values of the matrix, which is a NumPy ndarray, so that similar colors are grouped together, while keeping the diagonal (the yellow line) in place and the labels attached to their rows and columns.
path = "C:\\Users\\user\\Desktop\\texts\\dataset"
text_files = os.listdir(path)
#print (text_files)
tfidf_vectorizer = TfidfVectorizer()
# Labels and documents come from the same filtered list so they stay aligned.
labels = [f for f in text_files if f.endswith('.txt')]
documents = []
for f in labels:
    # os.listdir returns bare file names: join them with `path`, otherwise
    # open() looks in the current working directory. The with-block closes
    # every file handle.
    with open(os.path.join(path, f), encoding="utf-8") as handle:
        documents.append(handle.read())
sparse_matrix = tfidf_vectorizer.fit_transform(documents)
pairwise_similarity = sparse_matrix * sparse_matrix.T
pairwise_similarity_array = pairwise_similarity.toarray()
fig, ax = plt.subplots(figsize=(20,20))
cax = ax.matshow(pairwise_similarity_array, interpolation='spline16')
ax.grid(True)
plt.title('News articles similarity matrix')
# One tick per document instead of a hard-coded 23.
plt.xticks(range(len(labels)), labels, rotation=90)
plt.yticks(range(len(labels)), labels)
fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
plt.show()
Here is one possibility.
The idea is to use the information in the similarity matrix and put elements next to each other if they are similar. If two items are similar they should also be similar with respect to other elements ie have similar colors.
I start with the element which has the most in common with all other elements (this choice is a bit arbitrary) [a] and as next element I choose from the remaining elements the one which is closest to the current [b].
import numpy as np
import matplotlib.pyplot as plt
def create_dummy_sim_mat(n, rng=None):
    """Create a random symmetric ``n x n`` similarity matrix with unit diagonal.

    Args:
        n: number of elements (rows/columns).
        rng: optional random source exposing ``random(size)`` (for example
            ``np.random.default_rng(seed)``) for reproducible output. When
            omitted, NumPy's global random state is used, as before.

    Returns:
        ``(n, n)`` float array with values in [0, 1].
    """
    source = np.random if rng is None else rng
    sm = source.random((n, n))
    # Average with the transpose to make the matrix symmetric.
    sm = (sm + sm.T) / 2
    # Every element is maximally similar to itself.
    np.fill_diagonal(sm, 1)
    return sm
def argsort_sim_mat(sm):
    """Greedy ordering that places similar elements next to each other.

    Starts with the element whose row sum is largest [a] and then repeatedly
    appends the not-yet-used element most similar to the last one added [b].

    Args:
        sm: square ``(n, n)`` similarity matrix.

    Returns:
        Integer array holding a permutation of ``range(n)``; empty for an
        empty matrix.
    """
    sm = np.asarray(sm)
    if len(sm) == 0:
        return np.array([], dtype=np.intp)
    idx = [np.argmax(np.sum(sm, axis=1))]  # a
    for _ in range(1, len(sm)):
        # astype returns a copy, so the caller's matrix is not modified.
        sm_i = sm[idx[-1]].astype(float)
        # Mask used elements with -inf rather than -1: with values <= -1
        # (e.g. distances stored as negative similarities) a fixed -1 could
        # win the argmax and select an element twice.
        sm_i[idx] = -np.inf
        idx.append(np.argmax(sm_i))  # b
    return np.array(idx)
n = 10
sim_mat = create_dummy_sim_mat(n=n)
idx = argsort_sim_mat(sim_mat)
# Apply the same permutation to rows and columns in one indexing step.
sim_mat2 = sim_mat[np.ix_(idx, idx)]
# Plot results: original matrix on the left, reordered one on the right.
fig, ax = plt.subplots(1, 2)
for axis, matrix in zip(ax, (sim_mat, sim_mat2)):
    axis.imshow(matrix)
def ticks(_ax, ti, la):
    """Put the same tick positions ``ti`` and labels ``la`` on both axes of ``_ax``."""
    calls = (
        (_ax.set_xticks, ti),
        (_ax.set_yticks, ti),
        (_ax.set_xticklabels, la),
        (_ax.set_yticklabels, la),
    )
    # Positions are set before labels, x before y.
    for setter, value in calls:
        setter(value)
# Left plot (original matrix): ticks labelled in natural order 0..n-1.
ticks(_ax=ax[0], ti=range(n), la=range(n))
# Right plot (reordered matrix): ticks labelled with the permuted element indices.
ticks(_ax=ax[1], ti=range(n), la=idx)
After meTchaikovsky's answer I also tested my idea on a clustered similarity matrix (see first image) this method works but is not perfect (see second image).
Because I use the similarity between two elements as approximation to their similarity to all other elements, it is quite clear why this does not work perfectly.
So instead of using the initial similarity to sort the elements one could calculate a second order similarity matrix which measures how similar the similarities are (sorry).
This measure describes better what you are interested in. If two rows / columns have similar colors they should be close to each other. The algorithm to sort the matrix is the same as before
def add_cluster(sm, c=3):
    """Inject ``c`` high-similarity clusters into ``sm`` in place.

    The elements are randomly partitioned into ``c`` groups and every pair
    inside a group gets a similarity drawn from [0.9, 1.0).

    Args:
        sm: square similarity matrix; modified in place.
        c: number of clusters to create.

    Returns:
        None.
    """
    idx_cluster = np.array_split(np.random.permutation(len(sm)), c)
    for ic in idx_cluster:
        cluster_noise = np.random.uniform(0.9, 1.0, (len(ic), len(ic)))
        # Raw noise is asymmetric and would overwrite the unit diagonal, so
        # sm would stop being a valid similarity matrix. Symmetrise it and
        # restore the self-similarity of 1.
        cluster_noise = (cluster_noise + cluster_noise.T) / 2
        np.fill_diagonal(cluster_noise, 1)
        sm[np.ix_(ic, ic)] = cluster_noise
def get_sim_mat2(sm, eps=None):
    """Second-order similarity: how alike two rows of ``sm`` are.

    Entry (i, j) is ``1 / (||sm[i] - sm[j]|| + eps)``, so rows with similar
    values get a high score.

    Args:
        sm: ``(n, n)`` similarity matrix.
        eps: positive offset that keeps the diagonal (distance 0) finite.
            Defaults to ``1 / n`` with ``n = len(sm)``; the function used to
            read a global ``n`` that need not match the matrix size.

    Returns:
        ``(n, n)`` float array.
    """
    if eps is None:
        eps = 1 / max(len(sm), 1)
    # Broadcasting builds all pairwise row differences: shape (n, n, n).
    row_distance = np.linalg.norm(sm[:, np.newaxis] - sm[np.newaxis], axis=-1)
    return 1 / (row_distance + eps)
sim_mat = create_dummy_sim_mat(n=100)
add_cluster(sim_mat, c=4)
sim_mat2 = get_sim_mat2(sim_mat)
# Orderings from the first-order and the second-order similarity.
idx = argsort_sim_mat(sim_mat)
idx2 = argsort_sim_mat(sim_mat2)
# Both orderings are applied to the original matrix.
sim_mat_sorted = sim_mat[np.ix_(idx, idx)]
sim_mat_sorted2 = sim_mat[np.ix_(idx2, idx2)]
# Plot results
fig, ax = plt.subplots(1, 3)
for axis, matrix in zip(ax, (sim_mat, sim_mat_sorted, sim_mat_sorted2)):
    axis.imshow(matrix)
The results with this second method are quite good (see third image)
but I guess there exist cases where this approach also fails, so I would be happy about feedback.
Edit
I tried to explain it and did also link the ideas to the code with [a] and [b], but obviously I did not do a good job, so here is a second more verbose explanation.
You have n elements and a n x n similarity matrix sm where each cell (i, j) describes how similar element i is to element j. The goal is to order the rows / columns in such a way that one can see existing patterns in the similarity matrix. My idea to achieve this is really simple.
You start with an empty list and add elements one by one. The criterion for the next element is the similarity to the current element. If element i was added in the last step, I chose the element argmax(sm[i, :]) as next, ignoring the elements already added to the list. I ignore the elements by setting the values of those elements to -1.
You can use the function ticks to reorder the labels:
labels = np.array(labels) # make labels a NumPy array so it can be indexed with the index array idx
# NOTE(review): labels[idx] is the reordered label list, but it is applied to
# ax[0], which in the plotting snippet shows the unsorted matrix — confirm
# whether the sorted plot (ax[1]) was intended.
ticks(_ax=ax[0], ti=range(n), la=labels[idx])
@scleronomic's solution is very elegant, but it has one shortcoming: we cannot set the number of clusters in the sorted correlation matrix. Assume we are working with a set of variables, some of which are only weakly correlated:
import string
import numpy as np
import pandas as pd
n_variables = 20
n_clusters = 10
n_samples = 100
np.random.seed(100)
names = list(string.ascii_lowercase)[:n_variables]
# Cluster id of every variable and one latent signal per cluster.
belongs_to_cluster = np.random.randint(0,n_clusters,n_variables)
latent = np.random.randn(n_clusters,n_samples)
variables = np.random.rand(n_variables,n_samples)
for ind in range(n_clusters):
    mask = belongs_to_cluster == ind
    if ind % 2 == 0:
        # weakening the correlation: even clusters get only 10% of the signal
        variables[mask] += latent[ind]*0.1
    else:
        # Without this `else`, even clusters received 1.1x the latent signal
        # and ended up more, not less, correlated than the odd ones.
        variables[mask] += latent[ind]
df = pd.DataFrame(dict(zip(names, variables)))
corr_mat = np.array(df.corr())
As you can see, there are 10 clusters of variables by construction; however, the variables within clusters that have an even index are only weakly correlated. If we only want to see roughly 5 clusters in the sorted correlation matrix, we may need to find another way.
Based on this post, which is the accepted answer to the question "Clustering a correlation matrix", to sort a correlation matrix into blocks, what we need to find are blocks, where correlations within blocks are high and correlations between blocks are low. However, the solution provided by this accepted answer works best when we know how many blocks are there in the first place, and more importantly, the sizes of the underlying blocks are the same, or at least similar. Therefore, I improved the solution with a new function sort_corr_mat
def sort_corr_mat(corr_mat,clusters_guess):
    """Reorder a correlation matrix into blocks by greedy pairwise swaps.

    Every sweep tries all ordered pairs of variables, swaps their rows and
    columns and keeps a swap whenever ``score`` improves. ``score`` is defined
    outside this block and is called as
    ``score(matrix, n_variables, clusters_guess)``.

    Args:
        corr_mat: square NumPy array of correlations.
        clusters_guess: expected number of blocks, forwarded to ``score``.

    Returns:
        The reordered correlation matrix (``corr_mat`` itself if no swap
        improves the score).
    """
    def _swap_rows(mat, var1, var2):
        # Swap variables var1 and var2 in both rows and columns, on a copy.
        swapped = mat.copy()
        swapped[[var1, var2], :] = swapped[[var2, var1], :]
        swapped[:, [var1, var2]] = swapped[:, [var2, var1]]
        return swapped

    # Use the matrix's own size instead of a global `n_variables`.
    n_vars = len(corr_mat)
    max_iter = 500
    num_minima_to_visit = 20
    best_score,current_score,best_count = -1e8,-1e8,0
    best_corr = corr_mat
    best_ordering = np.arange(n_vars)
    for _ in range(max_iter):
        for row1 in range(n_vars):
            for row2 in range(n_vars):
                if row1 == row2:
                    continue
                option_corr = _swap_rows(best_corr,row1,row2)
                option_score = score(option_corr,n_vars,clusters_guess)
                if option_score > best_score:
                    option_ordering = best_ordering.copy()
                    option_ordering[[row1, row2]] = best_ordering[[row2, row1]]
                    best_corr = option_corr
                    best_ordering = option_ordering
                    best_score = option_score
        if best_score <= current_score:
            # A full sweep without improvement: later sweeps would repeat the
            # same rejected swaps (assuming `score` is deterministic), so
            # stop instead of idling through the remaining iterations.
            break
        best_count += 1
        current_score = best_score
        if best_count >= num_minima_to_visit:
            break
    return best_corr#,best_ordering
With this function and the corr_mat constructed in the first place, I compared the result obtained with my function (on the right) with that obtained with #scleronomic's solution (in the middle)
# The greedy ordering is deterministic, so compute it once and reuse it.
order = argsort_sim_mat(corr_mat)
sim_mat_sorted = corr_mat[order, :][:, order]
corr_mat_sorted = sort_corr_mat(corr_mat, clusters_guess=5)
# Plot results: original, greedy ordering, swap-based ordering.
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
for axis, matrix in zip(ax, (corr_mat, sim_mat_sorted, corr_mat_sorted)):
    axis.imshow(matrix)
Clearly, #scleronomic's solution works much better and faster, but my solution offers more control to the pattern of the output.

How to simplify text comparison for big data-set where text meaning is same but not exact - deduplicate text data

I have text data set (different menu items like chocolate, cake, coke etc) of around 1.8 million records which belongs to 6 different categories (category A, B, C, D, E, F). one of the category has around 700k records. Most of the menu items are mixed up in multiple categories to which they doesn't belong to, for example: cake belongs to category 'A' but it is found in category 'B' & 'C' as well.
I want to identify those misclassified items and report to a personnel but the challenge is the item name is not always correct because it is totally human typed text. For example: Chocolate might be updated as hot chclt, sweet choklate, chocolat etc. There can also be items like chocolate cake ;)
To handle this, I tried a simple method that uses cosine similarity to compare items category by category and identify those anomalies, but it takes a lot of time since I am comparing each item against up to 1.8 million records (sample code is shown below). Can anyone suggest a better way to deal with this problem?
#Function
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def cos_similarity(a,b):
    """Cosine similarity of two strings over their sets of non-stop-word tokens.

    Both strings are tokenised, stop words are dropped, and each string is
    treated as a binary bag of words. For binary vectors the cosine is
    ``|A & B| / sqrt(|A| * |B|)``, so no explicit vectors are needed.

    Args:
        a: first string.
        b: second string.

    Returns:
        Similarity in [0, 1]; 0 when either string has no remaining tokens.
    """
    # The stop-word list used to be re-read and scanned linearly on every
    # call; a cached frozenset makes the membership test O(1).
    sw = _english_stopwords()
    X_set = {w for w in word_tokenize(a) if w not in sw}
    Y_set = {w for w in word_tokenize(b) if w not in sw}
    norm = float((len(X_set) * len(Y_set)) ** 0.5)
    if norm > 0:
        return len(X_set & Y_set) / norm
    return 0
#Base code
# Collect [item_A, item_B, similarity] for every cross-category pair whose
# similarity exceeds 0.5.
cos_sim_list = []
# Convert category A's names once instead of once per inner iteration.
names_a = [str(name) for name in category_A['item_name']]
for raw_name_b in category_B['item_name']:
    ln_i = str(raw_name_b)
    for ln_j in names_a:
        degreeOfSimilarity = cos_similarity(ln_j,ln_i)
        if degreeOfSimilarity>0.5:
            cos_sim_list.append([ln_j,ln_i,degreeOfSimilarity])
Consider text is already cleaned
I used KNeighbor and cosine similarity to solve this case. Though I am running the code multiple times to compare category by category; still it is effective because of lesser number of categories. Please suggest me if any better solution is available
import time

from sklearn.neighbors import NearestNeighbors

# Unique item names of category A (a NumPy array, not a DataFrame).
cat_A_clean = category_A['item_name'].unique()
print('Vecorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(cat_A_clean)
print('Vecorizing completed...')
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
# Fix the order of B's unique names once, so that row i of the query matrix
# is guaranteed to correspond to unique_B[i] below.
unique_B = list(set(category_B['item_name'].values))


def getNearestN(query):
    """Return (distances, indices) of the nearest category-A item per query string."""
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices


t1 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(unique_B)
t = time.time()-t1
print("COMPLETED IN:", t)
print('finding matches...')
# Each row of `indices` holds the position of the single nearest neighbour.
# cat_A_clean is a plain array, so it is indexed by position; the former
# cat_A_clean['item_name'] lookup is invalid on an ndarray.
matches = [
    [round(distances[i][0],2), cat_A_clean[nearest[0]], unique_B[i]]
    for i, nearest in enumerate(indices)
]
print('Building data frame...')
matches = pd.DataFrame(matches, columns=['Match confidence (lower is better)','ITEM_A','ITEM_B'])
print('Done')
def clean_string(text):
    """Return ``text`` converted to ``str`` and lower-cased."""
    return str(text).lower()
def cosine_sim_vectors(vec1,vec2):
    """Cosine similarity of two vectors, returned as a single number."""
    # cosine_similarity works on 2-D inputs, so each vector becomes one row;
    # the result is a 1x1 matrix.
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
def cos_similarity(sentences):
    """Cosine similarity of the first two strings in ``sentences``.

    The strings are lower-cased and turned into token-count vectors. Note
    that this definition replaces any earlier two-argument ``cos_similarity``
    defined under the same name.

    Args:
        sentences: sequence whose first two items are compared.

    Returns:
        The cosine similarity, or 0.0 when no token survives vectorisation.
    """
    cleaned = list(map(clean_string,sentences))
    print(cleaned)
    try:
        counts = CountVectorizer().fit_transform(cleaned)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty
        # (e.g. only one-character or stop-word tokens). Such a pair shares
        # no usable tokens, so report zero similarity instead of aborting
        # the whole comparison loop.
        return 0.0
    vectors = counts.toarray()
    print(vectors)
    return(cosine_sim_vectors(vectors[0],vectors[1]))
# For every nearest-neighbour match keep the distance, both item names and
# their token-based cosine similarity.
cos_sim_list = []
_columns = ('Match confidence (lower is better)', 'ITEM_A', 'ITEM_B')
for ind in matches.index:
    a, b, c = (matches[column][ind] for column in _columns)
    degreeOfSimilarity = cos_similarity([b, c])
    cos_sim_list.append([a, b, c, degreeOfSimilarity])

Found array with 0 feature(s) (shape=(268215, 0)) while a minimum of 1 is required by StandardScaler

I am solving a problem where I am pulling data of all the ProductIDs and then I iterate through the dataframe to look at unique ProductIDs to perform a set of functions.
Here, item is the ProductID/Item number:
#looping through the big dataframe to get a dataframe pertaining to the unique ID
# NOTE(review): excerpt only — the rest of the loop body is not shown here.
for item in df2['Item Nbr'].unique():
# fetch item data: boolean mask keeps the rows of the current item (original row labels are preserved)
df = df2.loc[df2['Item Nbr'] == item]
And then I have a set of custom made python functions:
So, when I get through the first loop (for one productID) it works all great, but when it iterates through the loop and goes to the next Product ID, I am certain that the data it is pulling out is right, but I get this error:
Found array with 0 feature(s) (shape=(268215, 0)) while a minimum of 1 is required by StandardScaler.
Although, the X_train and y_train shapes are : (268215, 6) (268215,)
Code Snippet : (Extra Information)
It is a huge file to show. But the initial big dataframe has
[362988 rows x 7 columns] - for first product and
[268215 rows x 7 columns] - for second product
Expansion of the code:
the big dataframe with two unique product IDS
# Load the full dataset; get_item_data is defined outside this excerpt.
biqQueryData = get_item_data(verbose=True)
iterate over each unique product ID for extracting a subset of dataframes that pertain to the product ID
# Run the modelling function once per distinct item number.
for item in biqQueryData['Item Nbr'].unique():
# Rows of the full frame belonging to the current item; .loc keeps their original row labels.
df = biqQueryData.loc[biqQueryData['Item Nbr'] == item]
# NOTE(review): the matching except/finally clause is not part of this excerpt.
try:
df_model = model_all_stores(df, item, n_jobs=n_jobs,
train_model=train_model,
test_model=test_model,
tune_model=tune_model,
export_model=export_model,
output=export_demand)
the function model_all_stores
def model_all_stores(df_raw, item_nbr, n_jobs=1, train_model=False,
                     test_model=False, export_model=False, output=False,
                     tune_model=False):
    """Models demand for specified item.

    Predict the demand of specified item for all stores. Does not
    filter for predict hidden demand (the function get_hidden_demand
    should be used for this.)

    Only ``df_raw``, ``n_jobs`` and ``train_model`` are used in the part of
    the function shown here.

    Output: data frame output
    """
    # ML model hyperparameters
    impute_with = 'median'
    n_estimators = 100
    min_samples_split = 3
    min_samples_leaf = 3
    max_depth = None
    # load data and subset traited and valid
    dfnew = subset_traited_valid(df_raw)
    # get known demand
    df_ma = get_demand(dfnew)
    # impute missing sales data
    median_sales = df_ma['Sales Qty'].median()
    df_ma['Sales Qty'] = df_ma['Sales Qty'].fillna(median_sales)
    # add moving average features
    df_ma = df_ma.sort_values('Gregorian Days')
    window_list = [7 * x for x in [1, 2, 4, 8, 16, 52]]
    # Previous day's sales within each store; the result keeps df_ma's index.
    lagged = df_ma.groupby('Store Nbr')['Sales Qty'].shift(1)
    lagged_by_store = lagged.groupby(df_ma['Store Nbr'])
    for w in window_list:
        # transform() returns a Series aligned with df_ma's own row labels.
        # The former `.rolling(...).mean().reset_index(0, drop=True)` threw
        # those labels away; for any subset whose labels are not 0..N-1 the
        # assignment then produced all-NaN MA columns, which the imputer
        # drops, leaving StandardScaler with 0 features.
        df_ma['MA' + str(w)] = lagged_by_store.transform(
            lambda s, w=w: s.rolling(window=w, min_periods=1).mean())
    X_full = df_ma.loc[:, 'MA7':].values
    #print(X_full.shape)
    # use full data if not testing/tuning
    rows_for_model = df_ma['Known Demand'].notnull()
    X = df_ma.loc[rows_for_model, 'MA7':].values
    y = df_ma.loc[rows_for_model, 'Known Demand'].values
    X_train, y_train = X, y
    print(X_train.shape, y_train.shape)
    if train_model:
        # instantiate model components
        # NOTE(review): `Imputer` and criterion='mse' only exist in older
        # scikit-learn releases; kept unchanged to match the installed version.
        imputer = Imputer(missing_values='NaN', strategy=impute_with, axis=0)
        scale = StandardScaler()
        pca = PCA()
        forest = RandomForestRegressor(n_estimators=n_estimators,
                                       max_features='sqrt',
                                       min_samples_split=min_samples_split,
                                       min_samples_leaf=min_samples_leaf,
                                       max_depth=max_depth,
                                       criterion='mse',
                                       random_state=42,
                                       warm_start=True,
                                       n_jobs=n_jobs)
        # pipeline for model
        pipeline_steps = [('imputer', imputer),
                          ('scale', scale),
                          ('pca', pca),
                          ('forest', forest)]
        regr = Pipeline(pipeline_steps)
        regr.fit(X_train, y_train)
    # NOTE(review): the remainder of the function is not part of this excerpt.
It fails here
Snippet Of data:
biqQueryData (the entire Dataframe)
364174,1084,2019-12-12,,,,0.0
.....
364174,1084,2019-12-13,,,,0.0
188880,397752,19421,2020-02-04,2.0,1.0,1.0,0.0
.....
188881,397752,19421,2020-02-05,2.0,1.0,1.0,0.0
Subset DF 1:
364174,1084,2019-12-12,,,,0.0
.....
364174,1084,2019-12-13,,,,0.0
Subset DF 2:
188880,397752,19421,2020-02-04,2.0,1.0,1.0,0.0
.....
188881,397752,19421,2020-02-05,2.0,1.0,1.0,0.0
Any help here would be great! Thank you

Problems with mapping user ids back to their respective cluster class in pandas

I did clustering and now want to map the cluster_class to each 'userid' row in my original dataframe. However, the last part of my code, the "mapping" step, does NOT return the dataframe I am expecting.
df=
userid recency frequency
1233 33232.0 5.715858
3344 23403.0 3.615858
#convert df to array
# NOTE(review): `data` is not defined in this excerpt; presumably it is the user dataframe shown above — confirm which columns it contains.
data=data.values
X=data
#Scale
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.25, min_samples=10).fit(X)
# Boolean mask with True at the positions of core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# One cluster label per row of X
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
#get dataframe with cluster_class and its data size
# NOTE(review): this rebinds `df`; a user dataframe previously held in `df` is no longer reachable through that name afterwards.
df=pd.DataFrame(pd.Series(labels).value_counts())
df.index.names = ['Cluster_Class']
df.rename(columns={ df.columns[0]: "Users" }, inplace = True)
df=
Users Cluster_Class
0 2096
-1 30
2 13
1 11
#MAP each cluster class to its member rows
# `labels` has one entry per row of X, so `labels == i` is the row mask for
# cluster i. Comparing the DBSCAN estimator itself (`db == i`) is a single
# False, and len(df.index.names) - 1 evaluates to 0, so nothing was selected.
cluster_ids = sorted(set(labels) - {-1})  # -1 marks noise, not a cluster
clusters = [X[labels == i] for i in cluster_ids]
for i, c in zip(cluster_ids, clusters):
    print('Cluster {} has {} members: {}...'.format(i, len(c), c[0]))
You need to compare the labels.
But you compare the clustering object of type DBSCAN to an integer.

Resources