What is the math behind TfidfVectorizer? - python-3.x

I am trying to understand the math behind TfidfVectorizer. I followed this tutorial, but changed the code a little.
The tutorial also says at the end: "The values differ slightly because sklearn uses a smoothed version of idf and various other little optimizations."
I want to be able to use TfidfVectorizer but also calculate the same simple sample by my hand.
Here is my whole code:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def main():
    """Print hand-computed TF, IDF and TF-IDF tables next to scikit-learn's.

    Works on a fixed two-document corpus. In each section the manual result
    is printed first and the scikit-learn result second, so the two can be
    compared by eye.
    """
    documentA = 'the man went out for a walk'
    documentB = 'the children sat around the fire'
    corpus = [documentA, documentB]
    bagOfWordsA = documentA.split(' ')
    bagOfWordsB = documentB.split(' ')
    uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

    print('----------- compare word count -------------------')
    # Count over the shared vocabulary so both dicts have identical keys.
    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsA:
        numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
    for word in bagOfWordsB:
        numOfWordsB[word] += 1
    tfA = computeTF(numOfWordsA, bagOfWordsA)
    tfB = computeTF(numOfWordsB, bagOfWordsB)
    print(pd.DataFrame([tfA, tfB]))
    # The token pattern matches tokens of one or more word characters, so the
    # single-letter word 'a' is kept.
    CV = CountVectorizer(stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b')
    cv_ft = CV.fit_transform(corpus)
    # use_idf=False with an l1 norm gives count / document length.
    tt = TfidfTransformer(use_idf=False, norm='l1')
    t = tt.fit_transform(cv_ft)
    print(pd.DataFrame(t.todense().tolist(), columns=CV.get_feature_names()))

    print('----------- compare idf -------------------')
    idfs = computeIDF([numOfWordsA, numOfWordsB])
    print(pd.DataFrame([idfs]))
    tfidfA = computeTFIDF(tfA, idfs)
    tfidfB = computeTFIDF(tfB, idfs)
    print(pd.DataFrame([tfidfA, tfidfB]))
    ttf = TfidfTransformer(use_idf=True, smooth_idf=False, norm=None)
    f = ttf.fit_transform(cv_ft)
    print(pd.DataFrame(f.todense().tolist(), columns=CV.get_feature_names()))

    print('----------- TfidfVectorizer -------------------')
    vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True, stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
    vectors = vectorizer.fit_transform([documentA, documentB])
    feature_names = vectorizer.get_feature_names()
    print(pd.DataFrame(vectors.todense().tolist(), columns=feature_names))
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: mapping of word -> raw count in the document.
        bagOfWords: the document's token list; its length is the divisor.

    Returns:
        A new dict mapping each word to ``count / len(bagOfWords)``.

    Raises:
        ZeroDivisionError: if ``bagOfWords`` is empty while ``wordDict`` is not.
    """
    bagOfWordsCount = float(len(bagOfWords))
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}
def computeIDF(documents):
    """Return the unsmoothed inverse document frequency, ln(N / df), per word.

    Args:
        documents: list of word -> count dicts. The vocabulary is taken from
            the keys of the first dict, so every dict is expected to share it.

    Returns:
        A dict mapping each word to ``math.log(N / df)``, where ``N`` is the
        number of documents and ``df`` the number of documents whose count
        for that word is positive.

    Raises:
        ZeroDivisionError: if a word has a positive count in no document.

    Note:
        scikit-learn's TfidfTransformer with ``smooth_idf=False`` uses
        ``ln(N / df) + 1``, so its values are larger by exactly 1.
    """
    import math
    N = len(documents)
    # First pass: document frequency of every word.
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    # Second pass: turn each document frequency into its IDF, in place.
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict
def computeTFIDF(tfBagOfWords, idfs):
    """Return the TF-IDF weight of every word in ``tfBagOfWords``.

    Args:
        tfBagOfWords: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency.

    Returns:
        A new dict mapping each word to ``tf * idf``.

    Raises:
        KeyError: if a word in ``tfBagOfWords`` is missing from ``idfs``.
    """
    return {word: val * idfs[word] for word, val in tfBagOfWords.items()}
# Run the comparison only when the file is executed as a script.
if __name__ == "__main__":
    main()
I can compare the term-frequency calculations, and both results look the same. But when I calculate the IDF and then the TF-IDF, there are differences between the code from the website and TfidfVectorizer (I also tried the combination of CountVectorizer and TfidfTransformer to be sure it returns the same results as TfidfVectorizer does).
Code Tf-Idf results:
TfidfVectorizer Tf-Idf results:
Can anybody help me with code that returns the same results as TfidfVectorizer, or with TfidfVectorizer settings that return the same results as the code above?

Here is my adaptation of your code, which reproduces the TfidfVectorizer output for your data.
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from IPython.display import display
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
corpus = [documentA, documentB]
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
print('----------- compare word count -------------------')
# Raw counts per document over the shared vocabulary.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1
# One row per document, one column per word, columns sorted alphabetically.
series_A = pd.Series(numOfWordsA)
series_B = pd.Series(numOfWordsB)
df = pd.concat([series_A, series_B], axis=1).T
df = df.reindex(sorted(df.columns), axis=1)
display(df)
tf_df = df.divide(df.sum(1),axis='index')
# Smoothed IDF: add 1 to the document count and to every document frequency,
# then add 1 to the logarithm.
n_d = 1+ tf_df.shape[0]
df_d_t = 1 + (tf_df.values>0).sum(0)
idf = np.log(n_d/df_d_t) + 1
# The IDF multiplies the raw counts (df), not the normalised frequencies (tf_df).
pd.DataFrame(df.values * idf,
             columns=df.columns )
tfidf = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
pd.DataFrame(tfidf.fit_transform(corpus).todense(),
             columns=tfidf.get_feature_names() )
For more details on the implementation, refer to the documentation here.

Related

How to do large-scale matrix-matrix multiplication in Spark

I have a dataframe in Spark that is a list of (user, item, rating) rows:
user item rating
ust001 ipx001 5
ust002 ipx04 2
ust001 itx001 4
ust002 iox04 5
Assume I have n users and m items; then I can construct a matrix A of size n x m.
My goal is to use this matrix to compute the item-item similarity, B = A^T * A, and save it as a scipy sparse matrix, B.npz.
Here is what I do in Python:
import numpy as np
import pandas as pd
import pickle
# NOTE(review): the file name is kept exactly as written; 'paruet' looks like
# a typo for 'parquet' -- confirm against the real file.
df = pd.read_parquet('user_item.paruet')
# mapping string to index
user2num = {}
item2num = {}
UID = 0
IID = 0
# remapping index to string
num2user = {}
num2item = {}
# loop over all elements and give every new user/item the next free index
for user, item in zip(df['user'], df['item']):
    if user not in user2num:
        user2num[user] = UID
        num2user[UID] = user
        UID += 1
    if item not in item2num:
        item2num[item] = IID
        num2item[IID] = item
        IID += 1
# save the string-index pairs
with open('num2item.pickle', 'wb') as handle:
    pickle.dump(num2item, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('item2num.pickle', 'wb') as handle:
    pickle.dump(item2num, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('num2user.pickle', 'wb') as handle:
    pickle.dump(num2user, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('user2num.pickle', 'wb') as handle:
    pickle.dump(user2num, handle, protocol=pickle.HIGHEST_PROTOCOL)
# replace the string ids by their integer indices before saving
df["user"] = df["user"].map(user2num)
df["item"] = df["item"].map(item2num)
df.to_parquet('ID_user-item.parquet')
Then I have another script to compute matrix
# another file to compte item-item similarity
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from scipy import sparse
import scipy
df = pd.read_parquet('ID_user-item.parquet')
# The index -> string maps are loaded only for their sizes, which fix the
# matrix dimensions.
with open('num2item.pickle', 'rb') as handle:
    item_id = pickle.load(handle)
with open('num2user.pickle', 'rb') as handle:
    user_id = pickle.load(handle)
row = df['user'].values
col = df['item'].values
data = df['rating'].values
# A is users x items; B is its transpose, built by swapping the index arrays.
A = csr_matrix((data,(row, col)), shape=(len(user_id), len(item_id)))
B = csr_matrix((data,(col, row)), shape=(len(item_id), len(user_id)))
# C = A^T * A, the items x items product.
C = B.dot(A)
scipy.sparse.save_npz('item-item.npz', C)
#based on num2item, I can remap the index to string to retrival the item-item similarity.
The above is fine for a small dataset. However, if I have 500 GB of user-item-rating data, Python always runs out of memory.
My question is:
How can I obtain this item-item.npz by using spark, using the same logic?
Above is

Goodness of fit always being zero despite taking random data?

I'm trying to write code that generates random data and computes goodness of fit but I'm not understanding why the chi-squared test is always zero, may I have a fix for this ? For an attempted fix I tried playing around with different types to see if I get any resulting changes in the initial output, also I've tried changing the parameters to the loop in question.
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
    """Generate 12 random observations and print/return a chi-squared statistic.

    The logic is reproduced exactly as posted in the question. As written:
    - nothing is ever appended to ``expected_values``, so with the default
      arguments the zip below is empty, the loop body never runs and the
      returned statistic is 0;
    - ``e_v_f`` is built from ``o_v``, so the printed "expected" values are a
      copy of the observed ones;
    - the default lists are shared between calls, so ``observed_values``
      keeps growing by 12 on every call that relies on the default.

    Returns:
        The accumulated statistic ``t_s``.
    """
    # ---- Generation of data ----
    for i in range(0,12):
        a = random.randint(-10,10)
        b = random.randint(-10,10)
        y = a * (b + i)
        observed_values.append(y)
    # ---- Array setup: arrays are converted to floats before computing Chi2 ----
    t_s = 0
    o_v = np.array(observed_values)
    e_v = np.array(expected_values)
    o_v_f = o_v.astype(float)
    e_v_f = o_v.astype(float)
    z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
    for i in z_o_e_v_f:
        t_s += [((o_v_f)-(e_v_f))]**2/(e_v_f) # Computes the Chi2 stat
    print("Observed Values ", o_v_f)
    print("Expected Values" , e_v_f)
    df=len(o_v_f)-1
    print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
    return t_s
In your original code, e_v_f = o_v.astype(float) made o_v_f and e_v_f end up the same. There was also an issue in the for loop. I have edited your code a bit; see whether it does what you are looking for:
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values=None, expected_values=None):
    """Generate 12 random observations and return their chi-squared statistic.

    Each observation is ``a * (b + i)`` for random integers ``a`` and ``b`` in
    [-10, 10]; the matching expected value is the observation plus 5.

    Args:
        observed_values: optional list the observations are appended to.
            A fresh list is used when omitted, so calls do not share state.
        expected_values: optional list the expected values are appended to.
            A fresh list is used when omitted.

    Returns:
        The sum over all pairs of ``(observed - expected) ** 2 / expected``.

    Note:
        An observation of -5 gives an expected value of 0, which makes that
        term (and therefore the sum) infinite.
    """
    if observed_values is None:
        observed_values = []
    if expected_values is None:
        expected_values = []
    # ---- Generation of data ----
    for i in range(0,12):
        a_o = random.randint(-10,10)
        b_o = random.randint(-10,10)
        y_o = a_o * (b_o + i)
        observed_values.append(y_o)
        # a_e = random.randint(-10,10)
        # b_e = random.randint(-10,10)
        # y_e = a_e * (b_e + i)
        expected_values.append(y_o + 5)
    # ---- Array setup: arrays are converted to floats before computing Chi2 ----
    t_s = 0
    o_v_f = np.array(observed_values).astype(float)
    e_v_f = np.array(expected_values).astype(float)
    for o, e in zip(o_v_f, e_v_f):
        t_s += (o - e) **2 / e # Computes the Chi2 stat
    print("Observed Values ", o_v_f)
    print("Expected Values" , e_v_f)
    df=len(o_v_f)-1
    print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
    return t_s

How to handle different MFCC feature dimensions for different audio files

librosa.feature.mfcc returns different dimensions for different audio files. How should this case be handled when training or testing the model?
#test.py
import os
import pickle
import numpy as np
from scipy.io.wavfile import read
import librosa as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
def get_MFCC(sr,audio):
    """Return the scaled MFCC matrix of ``audio`` with NaN rows removed.

    Args:
        sr: sampling rate of ``audio``.
        audio: the audio samples passed to librosa.

    Returns:
        The 20-coefficient MFCC array, minus any row containing NaN, after
        ``preprocessing.scale``.

    NOTE(review): librosa documents the MFCC result as (n_mfcc, n_frames), so
    the second dimension grows with clip length -- confirm whether the models
    expect the transpose (frames as rows).
    """
    # Keyword arguments: newer librosa releases no longer accept the audio
    # and sampling rate positionally.
    features = mfcc.feature.mfcc(y=audio, sr=sr, n_mfcc=20, dct_type=2)
    feat = np.asarray(())
    for i in range(features.shape[0]):
        temp = features[i,:]
        if np.isnan(np.min(temp)):
            continue
        if feat.size == 0:
            # First kept row: start the stack with it.
            feat = temp
        else:
            feat = np.vstack((feat, temp))
    features = feat
    features = preprocessing.scale(features)
    return features
#path to test data
# NOTE(review): this path spells the user folder 'PrashuGupta' while modelpath
# below uses 'Prashu Gupta' -- confirm which one is correct.
source = "C:\\Users\\PrashuGupta\\Downloads\\datasets\\pygender\\test_data\\AudioSet\\female_clips\\"
#path to save trained model
modelpath = "C:\\Users\\Prashu Gupta\\Downloads\\datasets\\pygender\\"
gmm_files = [os.path.join(modelpath,fname) for fname in
             os.listdir(modelpath) if fname.endswith('.gmm')]
# Load every pickled model, closing each file as soon as it has been read.
models = []
for fname in gmm_files:
    with open(fname,'rb') as model_file:
        models.append(pickle.load(model_file))
genders = [fname.split("\\")[-1].split(".gmm")[0] for fname
           in gmm_files]
files = [os.path.join(source,f) for f in os.listdir(source)
         if f.endswith(".wav")]
for f in files:
    print (f.split("\\")[-1])
    audio,sr = mfcc.load(f, sr = 16000,mono = True)
    features = get_MFCC(sr,audio)
    scores = None
    log_likelihood = np.zeros(len(models))
    for i in range(len(models)):
        gmm = models[i] #checking with each model one by one
        scores = np.array(gmm.score(features))
        log_likelihood[i] = scores.sum()
    # The model with the highest summed score wins.
    winner = np.argmax(log_likelihood)
    print ("\tdetected as - ", genders[winner],"\n\tscores:female",log_likelihood[0],",male ", log_likelihood[1],"\n")
The error
Expected the input data X have 1800 features, but got 313 features in
scores = np.array(gmm.score(features))
Either you must truncate/pad the files so that they are all the same size (say 5 seconds), or summarize the features for each file into a fixed-length vector that does not depend on clip length (average/min/max), or make the classifier operate on a stream of fixed-length feature windows (say 1 second).

Why won't metrics for NB classifier print? (nltk.metrics.scores)

I am trying to print out the metrics for my naïve Bayes classifier model, but the code keeps returning 'None' for all of the print lines. I use the following code to print my metrics but can't figure out why it won't return the metric values I need. Any help is appreciated!
import collections
from nltk.metrics.scores import (precision, recall, f_measure)
# refsets: label -> indices of examples that truly carry that label.
# testsets: label -> indices of examples the classifier assigned to it.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(train_set):
    refsets[label].add(i)
    observed = nb_classifier.classify(feats)
    testsets[observed].add(i)
print('pos precision:', precision(refsets['pos'], testsets['pos']))
print('pos recall:', recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', precision(refsets['neg'], testsets['neg']))
print('neg recall:', recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))
TL;DR
import random
from collections import Counter
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.metrics.scores import precision, recall, f_measure
# One (word list, category) pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
# Frequency of every lower-cased word in the whole corpus.
all_words = Counter(w.lower() for w in movie_reviews.words())
def find_features(document, top_n=3000):
    """Return a word -> bool feature dict for ``document``.

    Args:
        document: iterable of the words in one document.
        top_n: number of vocabulary words to use as features.

    Returns:
        A dict mapping each of the first ``top_n`` keys of ``all_words`` (in
        the Counter's key order, not by frequency) to whether that word
        occurs in ``document``.
    """
    word_features = list(all_words.keys())[:top_n]
    words = set(document)
    return {w: (w in words) for w in word_features}
def train_test_split(documents, random_seed=0, split_on=0.95, top_n=3000):
    """Shuffle ``documents``, featurize them and split into train/test sets.

    Args:
        documents: list of (words, category) pairs; shuffled IN PLACE.
        random_seed: seed of the private random generator used for shuffling.
        split_on: fraction of the feature sets that goes to the training set.
        top_n: passed on to ``find_features``.

    Returns:
        A ``(training_set, testing_set)`` tuple of (features, category) lists.
    """
    # A dedicated Random instance keeps the shuffle reproducible without
    # touching the global random state.
    custom_random = random.Random(random_seed)
    custom_random.shuffle(documents)
    featuresets = [(find_features(rev, top_n), category) for (rev, category) in documents]
    split_on_int = int(len(featuresets) * split_on)
    training_set = featuresets[:split_on_int]
    testing_set = featuresets[split_on_int:]
    return training_set, testing_set
training_set, testing_set = train_test_split(documents)
The actual classifier training and evaluation:
# defaultdict is used below but is not among the imports shown with this
# snippet, so it is imported here.
from collections import defaultdict

nb = NaiveBayesClassifier.train(training_set)
# label -> set of test-example indices, once as predicted and once as annotated.
predictions, gold_labels = defaultdict(set), defaultdict(set)
for i, (features, label) in enumerate(testing_set):
    predictions[nb.classify(features)].add(i)
    gold_labels[label].add(i)
for label in predictions:
    print(label, 'Precision:', precision(gold_labels[label], predictions[label]))
    print(label, 'Recall:', recall(gold_labels[label], predictions[label]))
    print(label, 'F1-Score:', f_measure(gold_labels[label], predictions[label]))
    print()
[out]:
neg Precision: 0.803921568627451
neg Recall: 0.9534883720930233
neg F1-Score: 0.8723404255319148
pos Precision: 0.9591836734693877
pos Recall: 0.8245614035087719
pos F1-Score: 0.8867924528301887

document clustering in python

I am new to both Python and scikit-learn. I am going to cluster a bunch of text files (bodies of news articles), and I am using the following code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import nltk, sklearn, string, os
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
# Preprocessing text with NLTK package
token_dict = {}
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem`` applied to every token, in order.

    Args:
        tokens: iterable of tokens.
        stemmer: any object with a ``stem(token)`` method.
    """
    return [stemmer.stem(item) for item in tokens]
def tokenize(text):
    """Split ``text`` into NLTK word tokens and return their stems.

    Uses the module-level ``stemmer`` through ``stem_tokens``.
    """
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
###########################################################################
# Loading and preprocessing text data
print("\n Loading text dataset:")
path = 'n'
# Deletion table: str.translate needs a mapping, not the punctuation string
# itself, to actually remove the punctuation characters.
punctuation_table = str.maketrans('', '', string.punctuation)
for subdir, dirs, files in (os.walk(path)):
    for f in files:
        if f != '.DS_Store':
            file_path = os.path.join(subdir, f)
            # The context manager closes each file once it has been read.
            with open(file_path, 'r') as shakes:
                text = shakes.read()
            lowers = text.lower()
            no_punctuation = lowers.translate(punctuation_table)
            # Keyed by bare file name: files with the same name in different
            # sub-directories overwrite each other.
            token_dict[f] = no_punctuation
###########################################################################
true_k = 3 # *
print("\n Performing stemming and tokenization...")
vectorizer = TfidfVectorizer(tokenizer=tokenize, encoding='latin-1',
                             stop_words='english')
X = vectorizer.fit_transform(token_dict.values())
print("n_samples: %d, n_features: %d" % X.shape)
print()
# Do the actual clustering
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
y=km.fit(X)
print(km)
print("Top terms per cluster:")
# For every centroid, feature indices ordered by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    # The ten highest-weighted terms of this cluster, on one line.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()
This code gets the top words per cluster. But how can I know which original text files belong to cluster 0, cluster 1 or cluster 2?
To explain a bit more: you can store the cluster assignments using the following:
clusters = km.labels_.tolist()
This list will be ordered the same as the dict you passed to your vectorizer.
I just put together a guide to document clustering you might find helpful. Let me know if I can explain anything in more detail: http://brandonrose.org/clustering

Resources