Converting TFRECORD file to text data - python-3.x

I have converted a .txt file to TFRecords, making some changes to the data along the way. Now I want to convert it back, or read the resulting file, so I can understand my data in its changed form. I am doing this for my knowledge graph project.
import numpy as np
import os
import tensorflow as tf
import tqdm
import pdb
import glob
import time
import sys
import re
import argparse
import fastBPE
import platform
use_py3 = platform.python_version()[0] == '3'
parser = argparse.ArgumentParser(description='TensorFlow code for creating TFRecords data')
parser.add_argument('--text_file', type=str, required=True,
                    help='location of text file to convert to TFRecords')
parser.add_argument('--control_code', type=str, required=True,
                    help='control code to use for this file. must be in the vocabulary, else it will error out.')
parser.add_argument('--sequence_len', type=int, required=True,
                    help='sequence length of model being fine-tuned (256 or 512)')
args = parser.parse_args()
path_to_train_file = fname = args.text_file
domain = [args.control_code]
train_text = open(path_to_train_file, 'rb').read().decode(encoding='utf-8')
bpe = fastBPE.fastBPE('../codes', '../vocab')
tokenized_train_text = bpe.apply([train_text.encode('ascii', errors='ignore') if not use_py3 else train_text])[0] # will NOT work for non-English texts
# if you want to run non-english text, please tokenize separately using ./fast applybpe and then run this script on the .bpe file with utf8 encoding
tokenized_train_text = re.findall(r'\S+|\n', tokenized_train_text)
tokenized_train_text = list(filter(lambda x: x != u'##', tokenized_train_text))
# load the vocabulary from file
vocab = open('../vocab').read().decode(encoding='utf-8').split('\n') if not use_py3 else open('../vocab', encoding='utf-8').read().split('\n')
vocab = list(map(lambda x: x.split(' ')[0], vocab)) + ['<unk>'] + ['\n']
print ('{} unique words'.format(len(vocab)))
if args.control_code not in vocab:
    print('Provided control code is not in the vocabulary')
    print('Please provide a different one; refer to the vocab file for allowable tokens')
    sys.exit(1)
# Creating a mapping from unique characters to indices
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)
seq_length = args.sequence_len-1
def numericalize(x):
    count = 0
    for i in x:
        if i not in word2idx:
            print(i)
            count += 1
    return count>1, [word2idx.get(i, word2idx['<unk>']) for i in x]
tfrecords_fname = fname.lower()+'.tfrecords'
total = 0
skipped = 0
with tf.io.TFRecordWriter(tfrecords_fname) as writer:
    for i in tqdm.tqdm(range(0, len(tokenized_train_text), seq_length)):
        flag_input, inputs = numericalize(domain+tokenized_train_text[i:i+seq_length])
        flag_output, outputs = numericalize(tokenized_train_text[i:i+seq_length+1])
        total += 1
        if flag_input or flag_output:
            skipped += 1
            continue
        if len(inputs)!=seq_length+1 or len(outputs)!=seq_length+1:
            break
        example_proto = tf.train.Example(features=tf.train.Features(feature={'input': tf.train.Feature(int64_list=tf.train.Int64List(value=inputs)),
                                                                             'output': tf.train.Feature(int64_list=tf.train.Int64List(value=outputs))}))
        writer.write(example_proto.SerializeToString())
print('Done')
print('Skipped', skipped, 'of', total)
This is my code. I want to reverse each of these changes, i.e. read the TFRecords back into text, rather than convert to TFRecords.

Read the TFRecord with a TFRecordDataset.
Then iterate through the TFRecordDataset and for each element, write to a new text file or print out the results.
https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset
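For example, here is a hedged sketch of that read-back (it assumes the 'input'/'output' Int64List features written by the script above; the file name is a placeholder, and idx2word is the same vocab array the script builds):
import numpy as np
import tensorflow as tf

# feature spec matching the 'input'/'output' Int64List features written above
feature_description = {
    'input': tf.io.VarLenFeature(tf.int64),
    'output': tf.io.VarLenFeature(tf.int64),
}

def parse_example(serialized):
    parsed = tf.io.parse_single_example(serialized, feature_description)
    return tf.sparse.to_dense(parsed['input']), tf.sparse.to_dense(parsed['output'])

dataset = tf.data.TFRecordDataset('my_text_file.txt.tfrecords')  # placeholder file name
dataset = dataset.map(parse_example)

with open('recovered.txt', 'w', encoding='utf-8') as out:
    for inputs, outputs in dataset:
        # idx2word = np.array(vocab), as built in the script above
        words = [idx2word[i] for i in inputs.numpy()]
        out.write(' '.join(words) + '\n')
Note that this only recovers the BPE tokens; to get back something close to the original text you would also need to undo the fastBPE merges (typically by removing the '@@ ' separators).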

Related

How to iterate on keras Dataset and edit content

I am working on this movie classification problem:
https://www.tensorflow.org/tutorials/keras/text_classification
In this example, text files (12500 files with movie reviews) are read and a batched dataset is prepared like below:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
at the time of standardization
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # I WANT TO REMOVE STOP WORDS HERE, CAN I DO
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
Problem: I understand that I have the training dataset with labels in the variable raw_train_ds. Now I want to iterate over this dataset, remove stop words from the movie review text, and store the result back in the same variable. I tried to do it in the function custom_standardization, but it gives a type error.
I also tried to use tf.strings.as_strings, but it returns the error
InvalidArgumentError: Value for attr 'T' of string is not in the list of allowed values: int8, int16, int32, int64
Can someone please help with this, or simply explain how to remove stop words from the batched dataset?
It looks like TensorFlow does not currently have built-in support for stop word removal, just basic standardization (lowercasing and punctuation stripping). The TextVectorization layer used in the tutorial supports a custom standardization callback, but I couldn't find any stop word examples.
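(If you do want to try it inside the callback, one untested sketch, an assumption on my part rather than anything from the tutorial, is to build a single regex from a stop word list and strip it with tf.strings.regex_replace:
import re
import tensorflow as tf
from nltk.corpus import stopwords

# one alternation pattern over the NLTK stop word list; \b keeps whole-word matches only
stop_words = sorted(stopwords.words('english'), key=len, reverse=True)
stopword_pattern = r'\b(' + '|'.join(re.escape(w) for w in stop_words) + r')\b'

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    no_stopwords = tf.strings.regex_replace(stripped_html, stopword_pattern, '')
    # collapse the double spaces left behind by the removals
    return tf.strings.regex_replace(no_stopwords, r'\s+', ' ')
That said, the approach described next, cleaning the files on disk before loading them, is simpler and easier to debug.)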
Since the tutorial downloads the IMDB dataset and reads the text files from disk, you can just do the standardization manually with Python before reading them. This will modify the text files themselves, but you can then read the files in normally using tf.keras.preprocessing.text_dataset_from_directory, and the entries will already have the stop words removed.
#!/usr/bin/env python3
import pathlib
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def cleanup_text_files_in_folder(folder_name):
    text_files = []
    for file_path in pathlib.Path(folder_name).glob('*.txt'):
        text_files.append(str(file_path))
    print(f'Found {len(text_files)} files in {folder_name}')

    # Give some kind of status
    i = 0
    for text_file in text_files:
        replace_file_contents(text_file)
        i += 1
        if i % 1000 == 0:
            print("No of files processed =", i)
    return text_files

def replace_file_contents(input_file):
    """
    This will read in the contents of the text file, process it (clean up, remove stop words)
    and overwrite the new 'processed' output to that same file
    """
    with open(input_file, 'r') as file:
        file_data = file.read()
    file_data = process_text_adv(file_data)
    with open(input_file, 'w') as file:
        file.write(file_data)

def process_text_adv(text):
    # review without HTML tags
    text = BeautifulSoup(text, features="html.parser").get_text()
    # review without punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    # lowercase
    text = text.lower()
    # simple split
    text = text.split()
    swords = set(stopwords.words("english"))  # conversion into a set for fast searching
    text = [w for w in text if w not in swords]
    # joining of the split words by spaces and return
    return " ".join(text)

if __name__ == "__main__":
    # Download & untar the dataset beforehand; running this modifies the text files
    # in place. Back up the originals if that's a concern.
    cleanup_text_files_in_folder('aclImdb/train/pos/')
    cleanup_text_files_in_folder('aclImdb/train/neg/')
    cleanup_text_files_in_folder('aclImdb/test/pos/')
    cleanup_text_files_in_folder('aclImdb/test/neg/')
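After the cleanup the files load exactly as in the tutorial; a quick check (batch_size and seed are placeholder values):
import tensorflow as tf

batch_size = 32  # placeholder
seed = 42        # placeholder

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

# peek at one batch: the reviews should now be lowercase with stop words removed
for text_batch, label_batch in raw_train_ds.take(1):
    print(text_batch[0].numpy(), label_batch[0].numpy())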

Jupyter Kernel dies/Spyder console stops while training custom NER model in Spacy 2.0.11

I was trying to train a custom NER model in spaCy. Initially I had installed the latest spaCy version, but I was getting the following error during training:
ValueError: [E103] Trying to set conflicting doc.ents: A token can only be part of one entity, so make sure the entities you're setting don't overlap.
After that I installed spacy==2.0.11 and tried running my code. When I have around 10 rows of data to train on, the model works fine and saves to my output directory. But with the original training data (5K rows), my Jupyter kernel dies, or when I run it in Spyder, the console just exits!
I understand that the older spaCy version does not throw the ValueError, but it is still of no use, as I am unable to train my model.
Sample data:
CarryBag 09038820815c.txt
Stopperneedle 0903882080f4.txt
Foilbags 09038820819.txt
I have around 700 files like this with data to be tagged and in each file multiple entities need tagging.
Code for reference:
import spacy
# import en_core_web_sm
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

# Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'D:/Docs/'
matcher = PhraseMatcher(nlp.vocab)

list_str_index = []
to_train_ents = []

with open(r'D:\ner_dummy_pack.csv', newline='', encoding='utf-8') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            #print('K---'+ product)
            filename = row[1]
            file = open(DIR+filename, "r", encoding='utf-8')
            print(file)
            filecontents = file.read()
            for s in filecontents:
                filecontents = re.sub(r'\s+', ' ', filecontents)
                filecontents = re.sub(r'^https?:\/\/.*[\r\n]*', '', filecontents, flags=re.MULTILINE)
                filecontents = re.sub(r"http\S+", "", filecontents)
                filecontents = re.sub(r"[-\"#/#;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", filecontents)
                filecontents = re.sub(r'\d+', '', filecontents)  # removing all numbers
                filecontents = re.sub(' +', ' ', filecontents)
                #filecontents = filecontents.encode().decode('unicode-escape')
                filecontents = ''.join([line.lower() for line in filecontents])
                if "," in product:
                    product_patterns = product.split(',')
                    product_patterns = [i.strip() for i in product_patterns]
                    for elem in product_patterns:
                        matcher.add('PRODUCT', None, nlp(elem))
                else:
                    matcher.add('PRODUCT', None, nlp(product))
                print(filecontents)
                doc = nlp(filecontents)
                matches = matcher(doc)
                #print(matches)
                list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
                to_train_ents.append((filecontents, dict(entities=list_str_index)))
                break
        except Exception as e:
            print(e)
            pass

to_train_entsfinal = to_train_ents

def main(model=None, output_dir=None, n_iter=100):
    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")
    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    #nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)

if __name__ == '__main__':
    main()
Can anyone help me solve this? Even when I removed conflicting entities in a sample of 10+ rows, for example:
Blister abc.txt
Blisterpack abc.txt
Blisters abc.txt
the same issue happens and the model does not train.
Suggested changes:
def main(model=None, output_dir=None, n_iter=100):
    top_memory_precentage_use = 75 # or what ever number you choose

    def handle_memory(ruler):
        if psutil.virtual_memory().percent < top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training() #or just init the nlp object again
        return ruler

    # This fitted for my use case
    def dump_ruler_nonascii(ruler):
        path = Path(os.path.join(self.data_path, 'config.jsonl'))
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        return ruler

    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")
    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    #nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)

if __name__ == '__main__':
    main()
It is hard to tell you exactly why it is happening, but I can supply two helper functions for your training loop that you can adapt to your use case. In my case it was writing patterns, and I checked the memory use every iteration.
# add the following imports
import psutil
import os

top_memory_precentage_use = 75 # or what ever number you choose

def handle_memory(ruler):
    if psutil.virtual_memory().percent < top_memory_precentage_use:
        dump_ruler_nonascii(ruler)
        ruler = nlp.begin_training() #or just init the nlp object again
    return ruler

# This fitted for my use case
def dump_ruler_nonascii(ruler):
    path = Path(os.path.join(self.data_path, 'config.jsonl'))
    pattern = ruler.patterns
    with open(path, "a", encoding="utf-8") as f:
        for line in pattern:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
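For reference, one hedged way to use the same idea in the training loop above is to check memory once per iteration and stop (or dump state) before the kernel gets killed; the 75% threshold and the call site are assumptions, and self.data_path in dump_ruler_nonascii would need to be replaced with a plain output path in this script:
import psutil

def memory_exceeded(threshold_percent=75):
    """Return True when system memory use is above the given percentage."""
    return psutil.virtual_memory().percent > threshold_percent

# inside the `for itn in range(10)` loop, after each full pass over to_train_entsfinal:
#     if memory_exceeded():
#         print('Memory use too high, stopping early and saving the current model')
#         nlp.to_disk(output_dir)
#         break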

'no unique mode; found %d equally common values' % len(table) statistics.StatisticsError: no unique mode; found 2 equally common values

When I use a large amount of data I get this error: statistics.StatisticsError: no unique mode; found 2 equally common values. But when I use 100 rows of data it works. I can't understand the reason it doesn't work; can anyone help me and explain how to solve this error, please?
Data link: https://github.com/YoeriNijs/TweetAnalyzer
Code:
import warnings
warnings.filterwarnings("ignore")

import nltk, random, csv, sys
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features, accuracy
from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

def get_words_in_tweets(tweets):
    all_words = []
    try:
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words
    except Exception as e:
        print(e)

def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    #print (word_features)
    return word_features

def selectTweets(row):
    tweetWords = []
    words = row[0].split()
    for i in words:
        i = i.lower()
        i = i.strip('##\'"?,.!')
        tweetWords.append(i)
    row[0] = tweetWords
    if counter <= 120:
        trainTweets.append(row)
        #print(trainTweets)
        #print(('*')*30)
    else:
        testTweets.append(row)
        #print(testTweets)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

trainTweets = []
testTweets = []

#csvfile.csv
while True:
    # Ask for filename
    filename = str(input("> Please enter a filename (.csv): "))
    # Check if filename ends with .csv
    if filename.endswith(".csv"):
        try:
            # Open file
            with open(filename, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile, delimiter=';', quotechar='|')
                # Print success message
                print("> File opened successfully!")
                counter = 0
                for row in reader:
                    selectTweets(row)
                    counter += 1
            print(counter, "> Wait a sec for the results...")
            word_features = get_word_features(get_words_in_tweets(trainTweets))
            training_set = apply_features(extract_features, trainTweets)
            test_training_set = apply_features(extract_features, testTweets)
            classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
            classifier.show_most_informative_features(5)
            print(nltk.classify.util.accuracy(classifier, test_training_set))
            MNB_classifier = SklearnClassifier(MultinomialNB())
            MNB_classifier.train(training_set)
            print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, test_training_set))
            BNB_classifier = SklearnClassifier(BernoulliNB())
            BNB_classifier.train(training_set)
            print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BNB_classifier, test_training_set))
            LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
            LogisticRegression_classifier.train(training_set)
            print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set))*100)
            SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
            SGDClassifier_classifier.train(training_set)
            print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set))*100)
            SVC_classifier = SklearnClassifier(SVC())
            SVC_classifier.train(training_set)
            print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set))*100)
            LinearSVC_classifier = SklearnClassifier(LinearSVC())
            LinearSVC_classifier.train(training_set)
            print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set))*100)
            voted_classifier = VoteClassifier(classifier,
                                              LinearSVC_classifier,
                                              SGDClassifier_classifier,
                                              MNB_classifier,
                                              BNB_classifier,
                                              LogisticRegression_classifier)
            print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, test_training_set))*100)
            while True:
                tweet = str(input("Please enter the text of the tweet you want to analize: "))
                print(classifier.classify(extract_features(tweet.split())))
                while True:
                    print
                    repeat = str(input("> Do you want to check another tweet (y/n)? "))
                    if repeat == "n":
                        print("Exit program")
                        sys.exit()
                    if repeat != "y":
                        print("Something went wrong")
                    if repeat == "y":
                        break
        # If file does not exist, display this
        except IOError:
            print("File does not exist.")
    # Else if file does not end with .csv, do this
    else:
        print("Please open a file that ends with .csv")
It shows this error:
Traceback (most recent call last):
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 163, in <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in <listcomp>
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 35, in classify
return mode(votes)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\statistics.py", line 507, in mode
'no unique mode; found %d equally common values' % len(table)
statistics.StatisticsError: no unique mode; found 2 equally common values
The easiest way to solve this is to upgrade Python to 3.8 or higher.
In Python 3.7 and older, there can be only a single value that occurs most often in the whole set. If a set contains two or more such values, the mode is inconclusive and statistics.mode raises exactly the error you got.
However, since version 3.8 the behaviour has changed: when there are two or more modes in a set, the first one encountered in the data is returned as the result.
Example:
result = statistics.mode([1,1,2,2,3,3])
has three possible and equally common solutions: 1, 2, or 3, as each number occurs twice in the set;
in Python 3.7 this raises the error above,
in Python 3.8 this returns 1 as the mode (the first of the tied values encountered).
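If upgrading Python is not an option, a hedged workaround (my own suggestion, not part of the original code) is to replace statistics.mode in VoteClassifier.classify with collections.Counter, which simply picks one of the tied labels instead of raising:
from collections import Counter
from nltk.classify import ClassifierI

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        # most_common(1) returns the (label, count) pair of a most frequent label,
        # even when several labels are tied
        return Counter(votes).most_common(1)[0][0]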

Having issues computing the average of compound sentiment values for each text file in a folder

# below is the sentiment analysis code written for sentence-level analysis
import glob
import os
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

# Next, VADER is initialized so I can use it within the Python script
sid = SentimentIntensityAnalyzer()

# I will also initialize the 'english.pickle' function and give it a short name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Each of the text files is listed from the folder speeches
files = glob.glob(os.path.join(os.getcwd(), 'cnn_articles', '*.txt'))
text = []

# iterate over the list getting each file
for file in files:
    # open the file and then call .read() to get the text
    with open(file) as f:
        text.append(f.read())

text_str = "\n".join(text)

# This breaks up the paragraph into a list of strings.
sentences = tokenizer.tokenize(text_str)
sent = 0.0
count = 0

# Iterating through the list of sentences and extracting the compound scores
for sentence in sentences:
    count += 1
    scores = sid.polarity_scores(sentence)
    sent += scores['compound']  # Adding up the overall compound sentiment
    # print(sent, file=open('cnn_compound.txt', 'a'))

if count != 0:
    sent = float(sent / count)
    print(sent, file=open('cnn_compound.txt', 'a'))
With these lines of code, I have been able to get the average of all the compound sentiment values across all the text files combined. What I really want is the average compound sentiment value for each text file, so that if I have 10 text files in the folder, I get 10 floating-point values, one per file, which I can then plot against each other. Kindly assist me as I am very new to Python.
# below is the sentiment analysis code written for sentence-level analysis
import os, string, glob, pandas as pd, numpy as np
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

# Next, VADER is initialized so I can use it within the Python script
sid = SentimentIntensityAnalyzer()
exclude = set(string.punctuation)

# I will also initialize the 'english.pickle' function and give it a short name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Each of the text files is listed from the folder cnn_articles
files = glob.glob(os.path.join(os.getcwd(), 'cnn_articles', '*.txt'))

text = []
cnt = 0

# iterate over the list, computing one average per file
for file in files:
    f = open(file).read().split('.')
    cnt += 1
    count = len(f)
    sent = 0.0  # reset the running total for every file
    for sentence in f:
        if sentence not in exclude:
            scores = sid.polarity_scores(sentence)
            sent += scores['compound']
    average = round((sent / count), 4)
    t = [cnt, average]
    text.append(t)

df = pd.DataFrame(text, columns=['Article Number', 'Average Value'])
#df.to_csv(r'Result.txt', header=True, index=None, sep='\t', mode='w')
df.to_csv('cnn_result.csv', index=None)

document clustering in python

I am new to both Python and scikit-learn. I am going to cluster a bunch of text files (bodies of news articles), and I am using the following code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import nltk, sklearn, string, os
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans

# Preprocessing text with NLTK package
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

###########################################################################
# Loading and preprocessing text data
print("\n Loading text dataset:")
path = 'n'

for subdir, dirs, files in (os.walk(path)):
    for i, f in enumerate(files):
        if f != '.DS_Store':
            file_path = subdir + os.path.sep + f
            shakes = open(file_path, 'r')
            text = shakes.read()
            lowers = text.lower()
            no_punctuation = lowers.translate(string.punctuation)
            token_dict[f] = no_punctuation

###########################################################################
true_k = 3  # *
print("\n Performing stemming and tokenization...")
vectorizer = TfidfVectorizer(tokenizer=tokenize, encoding='latin-1',
                             stop_words='english')
X = vectorizer.fit_transform(token_dict.values())
print("n_samples: %d, n_features: %d" % X.shape)
print()

###############################################################################
# Do the actual clustering
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
y = km.fit(X)
print(km)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()
This code gets the top words per cluster. But how can I know which original text files belong to cluster 0, cluster 1, or cluster 2?
To explain a bit more: you can store the cluster assignments using the following:
clusters = km.labels_.tolist()
This list will be ordered the same as the dict you passed to your vectorizer.
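So, to see which original file went to which cluster, you can pair the file names (the keys of token_dict from the question) with those labels; a short sketch (it relies on token_dict.keys() and token_dict.values() iterating in the same order, which Python guarantees as long as the dict is not modified in between):
clusters = km.labels_.tolist()

# token_dict maps filename -> preprocessed text, and fit_transform() consumed
# token_dict.values() in the same iteration order as token_dict.keys()
for filename, cluster_id in zip(token_dict.keys(), clusters):
    print("{} -> cluster {}".format(filename, cluster_id))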
I just put together a guide to document clustering you might find helpful. Let me know if I can explain anything in more detail: http://brandonrose.org/clustering
