'Doc2Vec' object has no attribute 'syn0' - doc2vec

import gensim
from gensim.models.doc2vec import TaggedDocument

taggeddocs = []
tag2tweetmap = {}
for index, i in enumerate(cleaned_tweets):
    if len(i) > 2:  # Non-empty tweets
        tag = u'SENT_{:d}'.format(index)
        sentence = TaggedDocument(words=gensim.utils.to_unicode(i).split(), tags=[tag])
        tag2tweetmap[tag] = i
        taggeddocs.append(sentence)

model = gensim.models.Doc2Vec(taggeddocs, dm=0, alpha=0.025, size=20, min_alpha=0.025, min_count=0)
for epoch in range(60):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(taggeddocs, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

from sklearn.cluster import KMeans

dataSet = model.syn0
kmeansClustering = KMeans(n_clusters=6)
centroidIndx = kmeansClustering.fit_predict(dataSet)

topic2wordsmap = {}
for i, val in enumerate(dataSet):
    tag = model.docvecs.index_to_doctag(i)
    topic = centroidIndx[i]
    if topic in topic2wordsmap.keys():
        for w in tag2tweetmap[tag].split():
            topic2wordsmap[topic].append(w)
    else:
        topic2wordsmap[topic] = []

for i in topic2wordsmap:
    words = topic2wordsmap[i]
    print("Topic {} has words {}".format(i, words[:5]))
So I was trying to find the most commonly used words and a list of topics using the doc2vec method.
The problem is the attribute error saying "'Doc2Vec' object has no attribute 'syn0'", and I don't know what to do about it.

I found this doc2vec tutorial; it may give you some clue about your problem.
https://medium.com/#mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
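As for the error itself: since gensim 1.0 the raw vector arrays no longer live directly on the model object, so model.syn0 fails. Word vectors moved to model.wv and document vectors to model.docvecs (model.dv in gensim 4.x); the exact attribute names vary by version, so the following is only a sketch of the usual workaround for clustering the document vectors (the index_to_doctag call later in your snippet likewise assumes a pre-4.0 gensim):

import numpy as np
from sklearn.cluster import KMeans

# Document vectors: model.docvecs.vectors_docs in gensim 3.x
# (model.docvecs.doctag_syn0 in older releases), model.dv.vectors in 4.x.
try:
    dataSet = model.docvecs.vectors_docs
except AttributeError:
    dataSet = np.asarray(model.dv.vectors)

# Word vectors, if those are what you actually wanted, are at model.wv.vectors
# (formerly model.wv.syn0).

kmeansClustering = KMeans(n_clusters=6)
centroidIndx = kmeansClustering.fit_predict(dataSet)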

Related

Jupyter Kernel dies/Spyder console stops while training custom NER model in Spacy 2.0.11

I was trying to train a custom NER model in spaCy. Initially I had installed the latest spaCy version but was getting the following error during training:
ValueError: [E103] Trying to set conflicting doc.ents: A token can only be part of one entity, so make sure the entities you're setting don't overlap.
After that I installed spacy==2.0.11 and tried running my code. When I have around 10 rows of data to train, the model works fine and saves to my output directory. But when there is more data (5K rows), which is the original training data, my Jupyter kernel dies, or when I run it in Spyder, the console just exits!
I understand that the older spaCy version does not throw the value error, but it is still of no use, as I am unable to train my model.
Sample data:
CarryBag 09038820815c.txt
Stopperneedle 0903882080f4.txt
Foilbags 09038820819.txt
I have around 700 files like this with data to be tagged and in each file multiple entities need tagging.
Code for reference:
import spacy
# import en_core_web_sm
import re
import csv
from spacy.matcher import PhraseMatcher
import plac
from pathlib import Path
import random

# Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'D:/Docs/'
matcher = PhraseMatcher(nlp.vocab)
list_str_index = []
to_train_ents = []

with open(r'D:\ner_dummy_pack.csv', newline='', encoding='utf-8') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            # print('K---' + product)
            filename = row[1]
            file = open(DIR + filename, "r", encoding='utf-8')
            print(file)
            filecontents = file.read()
            for s in filecontents:
                filecontents = re.sub(r'\s+', ' ', filecontents)
                filecontents = re.sub(r'^https?:\/\/.*[\r\n]*', '', filecontents, flags=re.MULTILINE)
                filecontents = re.sub(r"http\S+", "", filecontents)
                filecontents = re.sub(r"[-\"#/#;:<>?{}*`• ?+=~|$.!‘?“”?,_]", " ", filecontents)
                filecontents = re.sub(r'\d+', '', filecontents)  # removing all numbers
                filecontents = re.sub(' +', ' ', filecontents)
                # filecontents = filecontents.encode().decode('unicode-escape')
                filecontents = ''.join([line.lower() for line in filecontents])
                if "," in product:
                    product_patterns = product.split(',')
                    product_patterns = [i.strip() for i in product_patterns]
                    for elem in product_patterns:
                        matcher.add('PRODUCT', None, nlp(elem))
                else:
                    matcher.add('PRODUCT', None, nlp(product))
                print(filecontents)
                doc = nlp(filecontents)
                matches = matcher(doc)
                # print(matches)
                list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
                to_train_ents.append((filecontents, dict(entities=list_str_index)))
                break
        except Exception as e:
            print(e)
            pass

to_train_entsfinal = to_train_ents

def main(model=None, output_dir=None, n_iter=100):
    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()

    # nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)

if __name__ == '__main__':
    main()
Can anyone help me solve this? Even when I removed conflicting entities in a sample of 10+ rows, for example:
Blister abc.txt
Blisterpack abc.txt
Blisters abc.txt
the same issue happens and the model does not train.
Suggested changes:
def main(model=None, output_dir=None, n_iter=100):
    top_memory_precentage_use = 75  # or whatever number you choose

    def handle_memory(ruler):
        if psutil.virtual_memory().percent < top_memory_precentage_use:
            dump_ruler_nonascii(ruler)
            ruler = nlp.begin_training()  # or just init the nlp object again
        return ruler

    # This fitted my use case
    def dump_ruler_nonascii(ruler):
        path = Path(os.path.join(self.data_path, 'config.jsonl'))
        pattern = ruler.patterns
        with open(path, "a", encoding="utf-8") as f:
            for line in pattern:
                f.write(json.dumps(line, ensure_ascii=False) + "\n")
        return ruler

    # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(10):
            losses = {}
            random.shuffle(to_train_entsfinal)
            for item in to_train_entsfinal:
                nlp.update([item[0]],
                           [item[1]],
                           sgd=optimizer,
                           drop=0.50,
                           losses=losses)
            print(losses)
            print("OUTTTTT")

    if output_dir is None:
        output_dir = "C:\\Users\\APRIL"
    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()

    # nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    random.shuffle(to_train_entsfinal)

if __name__ == '__main__':
    main()
It is hard to tell you why it is happening, but I can supply two helper functions for your training loop that you can adjust to your use case. In my case it was writing out patterns, and I checked the memory use every iteration.
# add the following imports
import psutil
import os

top_memory_precentage_use = 75  # or whatever number you choose

def handle_memory(ruler):
    if psutil.virtual_memory().percent < top_memory_precentage_use:
        dump_ruler_nonascii(ruler)
        ruler = nlp.begin_training()  # or just init the nlp object again
    return ruler

# This fitted my use case
def dump_ruler_nonascii(ruler):
    path = Path(os.path.join(self.data_path, 'config.jsonl'))
    pattern = ruler.patterns
    with open(path, "a", encoding="utf-8") as f:
        for line in pattern:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
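Not part of the answer above, but another common way to keep memory in check in a spaCy 2.x training loop is to update on small batches instead of one example at a time. A minimal sketch using spaCy's minibatch and compounding utilities, assuming the to_train_entsfinal, optimizer and other_pipes objects built earlier:

from spacy.util import minibatch, compounding

with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(10):
        losses = {}
        random.shuffle(to_train_entsfinal)
        # batch size grows from 4 to 32 as training progresses
        batches = minibatch(to_train_entsfinal, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.5, losses=losses)
        print(itn, losses)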

What is the math behind TfidfVectorizer?

I am trying to understand the math behind the TfidfVectorizer. I used this tutorial, but my code is changed a little bit.
The tutorial also says at the end that "The values differ slightly because sklearn uses a smoothed version of idf and various other little optimizations."
I want to be able to use TfidfVectorizer but also calculate the same simple example by hand.
Here is my whole code:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def main():
    documentA = 'the man went out for a walk'
    documentB = 'the children sat around the fire'
    corpus = [documentA, documentB]
    bagOfWordsA = documentA.split(' ')
    bagOfWordsB = documentB.split(' ')
    uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

    print('----------- compare word count -------------------')
    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsB:
        numOfWordsB[word] += 1

    tfA = computeTF(numOfWordsA, bagOfWordsA)
    tfB = computeTF(numOfWordsB, bagOfWordsB)
    print(pd.DataFrame([tfA, tfB]))

    CV = CountVectorizer(stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b')
    cv_ft = CV.fit_transform(corpus)
    tt = TfidfTransformer(use_idf=False, norm='l1')
    t = tt.fit_transform(cv_ft)
    print(pd.DataFrame(t.todense().tolist(), columns=CV.get_feature_names()))

    print('----------- compare idf -------------------')
    idfs = computeIDF([numOfWordsA, numOfWordsB])
    print(pd.DataFrame([idfs]))

    tfidfA = computeTFIDF(tfA, idfs)
    tfidfB = computeTFIDF(tfB, idfs)
    print(pd.DataFrame([tfidfA, tfidfB]))

    ttf = TfidfTransformer(use_idf=True, smooth_idf=False, norm=None)
    f = ttf.fit_transform(cv_ft)
    print(pd.DataFrame(f.todense().tolist(), columns=CV.get_feature_names()))

    print('----------- TfidfVectorizer -------------------')
    vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True, stop_words=None,
                                 token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
    vectors = vectorizer.fit_transform([documentA, documentB])
    feature_names = vectorizer.get_feature_names()
    print(pd.DataFrame(vectors.todense().tolist(), columns=feature_names))

def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

def computeIDF(documents):
    import math
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

if __name__ == "__main__":
    main()
I can compare the calculation of term frequency; both results look the same. But when I calculate the IDF and then the TF-IDF, there are differences between the code from the website and TfidfVectorizer (I also tried a combination of CountVectorizer and TfidfTransformer to make sure it returns the same results as TfidfVectorizer does).
Code Tf-Idf results: (output table omitted)
TfidfVectorizer Tf-Idf results: (output table omitted)
Can anybody help me with code that would return the same results as TfidfVectorizer, or with a setting of TfidfVectorizer that would return the same results as the code above?
Here is my adaptation of your code to reproduce the TfidfVectorizer output for your data.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from IPython.display import display

documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
corpus = [documentA, documentB]
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

print('----------- compare word count -------------------')
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

series_A = pd.Series(numOfWordsA)
series_B = pd.Series(numOfWordsB)
df = pd.concat([series_A, series_B], axis=1).T
df = df.reindex(sorted(df.columns), axis=1)
display(df)

tf_df = df.divide(df.sum(1), axis='index')

n_d = 1 + tf_df.shape[0]
df_d_t = 1 + (tf_df.values > 0).sum(0)
idf = np.log(n_d / df_d_t) + 1

pd.DataFrame(df.values * idf,
             columns=df.columns)

tfidf = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
pd.DataFrame(tfidf.fit_transform(corpus).todense(),
             columns=tfidf.get_feature_names())
For more details on the implementation, refer to the documentation here.
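For reference, the smoothed idf that scikit-learn applies by default (smooth_idf=True) is idf(t) = ln((1 + n) / (1 + df(t))) + 1, which is what the n_d / df_d_t lines above compute; with smooth_idf=False the two +1 terms inside the log are dropped. A small sanity check of my own, reusing the df, idf and tfidf objects defined above:

manual = df.values * idf
sklearn_out = np.asarray(tfidf.fit_transform(corpus).todense())
# should print True if the manual computation matches TfidfVectorizer (norm=None)
print(np.allclose(manual, sklearn_out))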

How to compare one document to all other in a dataset using spacy document similarity function?

I have a requirement to compare a document with all others in the dataset and get a similarity score. I am using spaCy's similarity function to do this. Since the number of documents in the dataset is about 10^6, the brute-force approach using two for loops takes a very long time. Is there any direct method to achieve this? Any help would be highly appreciated.
import uuid
import time

start_time = time.time()
counter = 1
similar_desc_uuid_dict_o = dict()
for doc1 in descs[:2]:
    uniqueid = str(uuid.uuid4())
    if counter % 1 == 0:
        print("Processed %d out of %d documents." % (counter, len(descs)))
    counter += 1
    for doc2 in descs:
        if doc1.similarity(doc2) >= 0.89:
            current_value = similar_desc_uuid_dict_o.get(str(doc2))
            if current_value is None:
                similar_desc_uuid_dict_o[str(doc2)] = uniqueid
            else:
                updated_value = current_value + " " + uniqueid
                similar_desc_uuid_dict_o[str(doc2)] = updated_value
print('Done. Time elapsed: {:.2f}mins'.format((time.time() - start_time) / 60))
similar_desc_uuid_dict_o
I found an alternative solution to perform the above task at scale using gensim.
Here is my working code:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
import sys
import logging
import time, traceback

def cossim(documents, query_docs=None, task='pairwise_similarity', metric_threshold=0.85, num_best=20, **kwargs):
    try:
        dictionary = Dictionary(documents)
        tfidf = TfidfModel(dictionary=dictionary)
        corpus = [dictionary.doc2bow(doc) for doc in documents]
        features_rep = 'bow'
        # switch the feature representation based on vocabulary size
        if len(dictionary) > 1000 and len(dictionary) <= 2000:
            corpus = [tfidf[doc] for doc in corpus]
            features_rep = 'tfidf'
        elif len(dictionary) > 2000:
            model = LsiModel(corpus, id2word=dictionary, num_topics=200)
            corpus = [model[tfidf[doc]] for doc in corpus]
            features_rep = 'lsi'

        index_tmpfile = get_tmpfile("index")
        index = Similarity(output_prefix=index_tmpfile, corpus=corpus, num_best=num_best,
                           num_features=len(dictionary), chunksize=256)

        similarities = []
        if task == 'pairwise_similarity':
            start_time = time.time()
            for sim in index:
                similarities.append(sim)
        elif task == 'batch_query':
            start_time = time.time()
            query_docs_features = [dictionary.doc2bow(doc) for doc in query_docs]
            if features_rep == 'tfidf':
                query_docs_features = [tfidf[doc] for doc in query_docs_features]
            elif features_rep == 'lsi':
                query_docs_features = [model[tfidf[doc]] for doc in query_docs_features]
            for sim in index[query_docs_features]:
                similarities.append(sim)

        filtered_results = []
        for ind_sim in similarities:
            filtered_results.append([item[0] for item in ind_sim if item[1] >= metric_threshold])

        if query_docs is not None:
            # stats() is a helper of mine that is not shown in this post
            matched_docs, unmatched_docs, matching_stats = stats(documents, query_docs, filtered_results)
            return matched_docs, unmatched_docs
        else:
            return filtered_results
    except Exception:
        logging.error(
            "Exception has occurred while performing Cosine Similarity. {}".format(traceback.format_exc()))

How to get frequencies of topics of NMF in sklearn

I am now using NMF to generate topics. My code is shown below. However, I do not know how to get the frequency of each topic. Can anyone help me? Thank you!
def fit_tfidf(documents):
    tfidf = TfidfVectorizer(input='content', stop_words='english',
                            use_idf=True, ngram_range=NGRAM_RANGE, lowercase=True,
                            max_features=MAX_FEATURES, min_df=1)
    tfidf_matrix = tfidf.fit_transform(documents.values).toarray()
    tfidf_feature_names = np.array(tfidf.get_feature_names())
    tfidf_reverse_lookup = {word: idx for idx, word in enumerate(tfidf_feature_names)}
    return tfidf_matrix, tfidf_reverse_lookup, tfidf_feature_names

def vectorization(documents):
    if VECTORIZER == 'tfidf':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_tfidf(documents)
    if VECTORIZER == 'bow':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_bow(documents)
    return vec_matrix, vec_reverse_lookup, vec_feature_names

def nmf_model(vec_matrix, vec_reverse_lookup, vec_feature_names, NUM_TOPICS):
    topic_words = []
    nmf = NMF(n_components=NUM_TOPICS, random_state=3).fit(vec_matrix)
    for topic in nmf.components_:
        word_idx = np.argsort(topic)[::-1][0:N_TOPIC_WORDS]
        topic_words.append([vec_feature_names[i] for i in word_idx])
    return topic_words
If you mean the frequency of each topic inside each document, then:
H = nmf.fit_transform(vec_matrix)
H is a matrix of shape (n_documents, n_topics). Each row represents a document vector in the topic space; in this vector you find the weight that each topic has (which translates to the topic's importance for that document).
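To turn that into overall topic frequencies across the whole corpus, you can normalize each document's topic weights and sum them per topic. A sketch of my own, assuming the fitted nmf and the vec_matrix from the question:

import numpy as np

W = nmf.transform(vec_matrix)                # shape: (n_documents, n_topics)
row_sums = W.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1.0                # guard against all-zero documents
W_norm = W / row_sums                        # per-document topic proportions
topic_freq = W_norm.sum(axis=0)              # total weight of each topic over the corpus
topic_freq = topic_freq / topic_freq.sum()   # normalize to fractions summing to 1

for topic_id, freq in enumerate(topic_freq):
    print("Topic {}: {:.2%} of the corpus".format(topic_id, freq))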

Why won't metrics for NB classifier print? (nltk.metrics.scores)

I am trying to print out the metrics for my Naive Bayes classifier model, but the code keeps printing 'None' for all of the print lines. I use the following code to print my metrics but can't figure out why it doesn't return the metric values I need; any help is appreciated!
import collections
from nltk.metrics.scores import (precision, recall, f_measure)

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(train_set):
    refsets[label].add(i)
    observed = nb_classifier.classify(feats)
    testsets[observed].add(i)

print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
TL;DR
import random
from collections import Counter
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.metrics.scores import precision, recall, f_measure

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = Counter(all_words)

def find_features(document, top_n=3000):
    word_features = list(all_words.keys())[:top_n]
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

def train_test_split(documents, random_seed=0, split_on=0.95, top_n=3000):
    custom_random = random.Random(random_seed)
    custom_random.shuffle(documents)
    featuresets = [(find_features(rev, top_n), category) for (rev, category) in documents]
    split_on_int = int(len(featuresets) * split_on)
    training_set = featuresets[:split_on_int]
    testing_set = featuresets[split_on_int:]
    return training_set, testing_set

training_set, testing_set = train_test_split(documents)
The actual classifier training and evaluation:
from collections import defaultdict

nb = NaiveBayesClassifier.train(training_set)

predictions, gold_labels = defaultdict(set), defaultdict(set)
for i, (features, label) in enumerate(testing_set):
    predictions[nb.classify(features)].add(i)
    gold_labels[label].add(i)

for label in predictions:
    print(label, 'Precision:', precision(gold_labels[label], predictions[label]))
    print(label, 'Recall:', recall(gold_labels[label], predictions[label]))
    print(label, 'F1-Score:', f_measure(gold_labels[label], predictions[label]))
    print()
[out]:
neg Precision: 0.803921568627451
neg Recall: 0.9534883720930233
neg F1-Score: 0.8723404255319148
pos Precision: 0.9591836734693877
pos Recall: 0.8245614035087719
pos F1-Score: 0.8867924528301887
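As a side note on the original symptom (my own observation, not part of the answer above): nltk's precision, recall and f_measure return None instead of raising when the relevant reference or test set is empty, which typically happens when the label strings you index with ('pos'/'neg') never actually occur in refsets/testsets. A small guard makes that visible:

for label in ('pos', 'neg'):
    if not refsets[label] or not testsets[label]:
        print("No examples collected for label {!r}; check your label values".format(label))
    else:
        print(label, 'precision:', precision(refsets[label], testsets[label]))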
