How to handle difference in MFCC feature for difference audio file - python-3.x

librosa.feature.mfcc returns difference dimensions for the different audio file. so how to handle this case for training or testing the model
#test.py
import os
import pickle
import numpy as np
from scipy.io.wavfile import read
import librosa as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")
def get_MFCC(sr,audio):
features = mfcc.feature.mfcc(audio,sr,n_mfcc=20, dct_type=2)
feat = np.asarray(())
for i in range(features.shape[0]):
temp = features[i,:]
if np.isnan(np.min(temp)):
continue
else:
if feat.size == 0:
feat = temp
else:
feat = np.vstack((feat, temp))
features = feat;
features = preprocessing.scale(features)
return features
#path to test data
source = "C:\\Users\\PrashuGupta\\Downloads\\datasets\\pygender\\test_data\\AudioSet\\female_clips\\"
#path to save trained model
modelpath = "C:\\Users\\Prashu Gupta\\Downloads\\datasets\\pygender\\"
gmm_files = [os.path.join(modelpath,fname) for fname in
os.listdir(modelpath) if fname.endswith('.gmm')]
models = [pickle.load(open(fname,'rb')) for fname in gmm_files]
genders = [fname.split("\\")[-1].split(".gmm")[0] for fname
in gmm_files]
files = [os.path.join(source,f) for f in os.listdir(source)
if f.endswith(".wav")]
for f in files:
print (f.split("\\")[-1])
audio,sr = mfcc.load(f, sr = 16000,mono = True)
features = get_MFCC(sr,audio)
scores = None
log_likelihood = np.zeros(len(models))
for i in range(len(models)):
gmm = models[i] #checking with each model one by one
scores = np.array(gmm.score(features))
log_likelihood[i] = scores.sum()
winner = np.argmax(log_likelihood)
print ("\tdetected as - ", genders[winner],"\n\tscores:female",log_likelihood[0],",male ", log_likelihood[1],"\n")
The error
Expected the input data X have 1800 features, but got 313 features in
scores = np.array(gmm.score(features))

Either you must truncate/pad files such that they are all the same size (say 5 seconds), or summarize the features for the file into a fixed length vector that does not depend on clip length (average/min/max), or you make the classifier operate on a stream of fixed-lenght feature windows (say 1 second).

Related

Converting TFRECORD file to text data

I have converted a .txt file to tfrecords with some changes to it. But now I want to convert or read same file so I could understand my data which is now changed. I am doing this for my knowledge graph project.
import numpy as np
import os
import tensorflow as tf
import tqdm
import pdb
import glob
import time
import sys
import re
import argparse
import fastBPE
import platform
use_py3 = platform.python_version()[0] == '3'
parser = argparse.ArgumentParser(description='TensorFlow code for creating TFRecords data')
parser.add_argument('--text_file', type=str, required=True,
help='location of text file to convert to TFRecords')
parser.add_argument('--control_code', type=str, required=True,
help='control code to use for this file. must be in the vocabulary, else it will error out.')
parser.add_argument('--sequence_len', type=int, required=True,
help='sequence length of model being fine-tuned (256 or 512)')
args = parser.parse_args()
path_to_train_file = fname = args.text_file
domain = [args.control_code]
train_text = open(path_to_train_file, 'rb').read().decode(encoding='utf-8')
bpe = fastBPE.fastBPE('../codes', '../vocab')
tokenized_train_text = bpe.apply([train_text.encode('ascii', errors='ignore') if not use_py3 else train_text])[0] # will NOT work for non-English texts
# if you want to run non-english text, please tokenize separately using ./fast applybpe and then run this script on the .bpe file with utf8 encoding
tokenized_train_text = re.findall(r'\S+|\n', tokenized_train_text)
tokenized_train_text = list(filter(lambda x: x != u'##', tokenized_train_text))
# load the vocabulary from file
vocab = open('../vocab').read().decode(encoding='utf-8').split('\n') if not use_py3 else open('../vocab', encoding='utf-8').read().split('\n')
vocab = list(map(lambda x: x.split(' ')[0], vocab)) + ['<unk>'] + ['\n']
print ('{} unique words'.format(len(vocab)))
if args.control_code not in vocab:
print('Provided control code is not in the vocabulary')
print('Please provide a different one; refer to the vocab file for allowable tokens')
sys.exit(1)
# Creating a mapping from unique characters to indices
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)
seq_length = args.sequence_len-1
def numericalize(x):
count = 0
for i in x:
if i not in word2idx:
print(i)
count += 1
return count>1, [word2idx.get(i, word2idx['<unk>']) for i in x]
tfrecords_fname = fname.lower()+'.tfrecords'
total = 0
skipped = 0
with tf.io.TFRecordWriter(tfrecords_fname) as writer:
for i in tqdm.tqdm(range(0, len(tokenized_train_text), seq_length)):
flag_input, inputs = numericalize(domain+tokenized_train_text[i:i+seq_length])
flag_output, outputs = numericalize(tokenized_train_text[i:i+seq_length+1])
total += 1
if flag_input or flag_output:
skipped += 1
continue
if len(inputs)!=seq_length+1 or len(outputs)!=seq_length+1:
break
example_proto = tf.train.Example(features=tf.train.Features(feature={'input': tf.train.Feature(int64_list=tf.train.Int64List(value=inputs)),
'output': tf.train.Feature(int64_list=tf.train.Int64List(value=outputs))}))
writer.write(example_proto.SerializeToString())
print('Done')
print('Skipped', skipped, 'of', total)
This is my code I want every changes in it except that to convert in tfrecords.
Read the TFRecord with a TFRecordDataset.
Then iterate through the TFRecordDataset and for each element, write to a new text file or print out the results.
https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset

Python 3 Multiprocessing and openCV problem with dictionary sharing between processor

I would like to use multiprocessing to compute the SIFT extraction and SIFT matching for object detection.
For now, I have a problem with the return value of the function that does not insert data in the dictionary.
I'm using Manager class and image that are open inside the function. But does not work.
Finally, my idea is:
Computer the keypoint for every reference image, use this keypoint as a parameter of a second function that compares and match with the keypoint and descriptors of the test image.
My code is:
# %% Import Section
import cv2
import numpy as np
from matplotlib import pyplot as plt
import os
from datetime import datetime
from multiprocessing import Process, cpu_count, Manager, Lock
import argparse
# %% path section
tests_path = 'TestImages/'
references_path = 'ReferenceImages2/'
result_path = 'ResultParametrizer/'
#%% Number of processor
cpus = cpu_count()
# %% parameter section
eps = 1e-7
useTwo = False # using the m and n keypoint better with False
# good point parameters
distanca_coefficient = 0.75
# gms parameter
gms_thresholdFactor = 3
gms_withRotation = True
gms_withScale = True
# flann parameter
flann_trees = 5
flann_checks = 50
#%% Locker
lock = Lock()
# %% function definition
def keypointToDictionaries(keypoint):
x, y = keypoint.pt
pt = float(x), float(y)
angle = float(keypoint.angle) if keypoint.angle is not None else None
size = float(keypoint.size) if keypoint.size is not None else None
response = float(keypoint.response) if keypoint.response is not None else None
class_id = int(keypoint.class_id) if keypoint.class_id is not None else None
octave = int(keypoint.octave) if keypoint.octave is not None else None
return {
'point': pt,
'angle': angle,
'size': size,
'response': response,
'class_id': class_id,
'octave': octave
}
def dictionariesToKeypoint(dictionary):
kp = cv2.KeyPoint()
kp.pt = dictionary['pt']
kp.angle = dictionary['angle']
kp.size = dictionary['size']
kp.response = dictionary['response']
kp.octave = dictionary['octave']
kp.class_id = dictionary['class_id']
return kp
def rootSIFT(dictionary, image_name, image_path,eps=eps):
# SIFT init
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
sift = cv2.xfeatures2d.SIFT_create()
keypoints, descriptors = sift.detectAndCompute(image, None)
descriptors /= (descriptors.sum(axis=1, keepdims=True) + eps)
descriptors = np.sqrt(descriptors)
print('Finito di calcolare, PID: ', os.getpid())
lock.acquire()
dictionary[image_name]['keypoints'] = keypoints
dictionary[image_name]['descriptors'] = descriptors
lock.release()
def featureMatching(reference_image, reference_descriptors, reference_keypoints, test_image, test_descriptors,
test_keypoints, flann_trees=flann_trees, flann_checks=flann_checks):
# FLANN parameter
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=flann_trees)
search_params = dict(checks=flann_checks) # or pass empty dictionary
flann = cv2.FlannBasedMatcher(index_params, search_params)
flann_matches = flann.knnMatch(reference_descriptors, test_descriptors, k=2)
matches_copy = []
for i, (m, n) in enumerate(flann_matches):
if m.distance < distanca_coefficient * n.distance:
matches_copy.append(m)
gsm_matches = cv2.xfeatures2d.matchGMS(reference_image.shape, test_image.shape, keypoints1=reference_keypoints,
keypoints2=test_keypoints, matches1to2=matches_copy,
withRotation=gms_withRotation, withScale=gms_withScale,
thresholdFactor=gms_thresholdFactor)
#%% Starting reference list file creation
reference_init = datetime.now()
print('Start reference file list creation')
reference_image_process_list = []
manager = Manager()
reference_image_dictionary = manager.dict()
reference_image_list = manager.list()
for root, directories, files in os.walk(references_path):
for file in files:
if file.endswith('.DS_Store'):
continue
reference_image_path = os.path.join(root, file)
reference_name = file.split('.')[0]
image = cv2.imread(reference_image_path, cv2.IMREAD_GRAYSCALE)
reference_image_dictionary[reference_name] = {
'image': image,
'keypoints': None,
'descriptors': None
}
proc = Process(target=rootSIFT, args=(reference_image_list, reference_name, reference_image_path))
reference_image_process_list.append(proc)
proc.start()
for proc in reference_image_process_list:
proc.join()
reference_end = datetime.now()
reference_time = reference_end - reference_init
print('End reference file list creation, time required: ', reference_time)
I faced pretty much the same error. It seems that the code hangs at detectAndCompute in my case, not when creating the dictionary. For some reason, sift feature extraction is not multi-processing safe (to my understanding, it is the case in Macs but I am not totally sure.)
I found this in a github thread. Many people say it works but I couldn't get it worked. (Edit: I tried this later which works fine)
Instead I used multithreading which is pretty much the same code and works perfectly. Of course you need to take multithreading vs multiprocessing into account

What is the math behind TfidfVectorizer?

I am trying to understand the math behind the TfidfVectorizer. I used this tutorial, but my code is a little bit changed:
what also says at the end that The values differ slightly because sklearn uses a smoothed version idf and various other little optimizations.
I want to be able to use TfidfVectorizer but also calculate the same simple sample by my hand.
Here is my whole code:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def main():
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
corpus = [documentA, documentB]
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
print('----------- compare word count -------------------')
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
numOfWordsB[word] += 1
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
print(pd.DataFrame([tfA, tfB]))
CV = CountVectorizer(stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b')
cv_ft = CV.fit_transform(corpus)
tt = TfidfTransformer(use_idf=False, norm='l1')
t = tt.fit_transform(cv_ft)
print(pd.DataFrame(t.todense().tolist(), columns=CV.get_feature_names()))
print('----------- compare idf -------------------')
idfs = computeIDF([numOfWordsA, numOfWordsB])
print(pd.DataFrame([idfs]))
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
print(pd.DataFrame([tfidfA, tfidfB]))
ttf = TfidfTransformer(use_idf=True, smooth_idf=False, norm=None)
f = ttf.fit_transform(cv_ft)
print(pd.DataFrame(f.todense().tolist(), columns=CV.get_feature_names()))
print('----------- TfidfVectorizer -------------------')
vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True, stop_words=None, token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
vectors = vectorizer.fit_transform([documentA, documentB])
feature_names = vectorizer.get_feature_names()
print(pd.DataFrame(vectors.todense().tolist(), columns=feature_names))
def computeTF(wordDict, bagOfWords):
tfDict = {}
bagOfWordsCount = len(bagOfWords)
for word, count in wordDict.items():
tfDict[word] = count / float(bagOfWordsCount)
return tfDict
def computeIDF(documents):
import math
N = len(documents)
idfDict = dict.fromkeys(documents[0].keys(), 0)
for document in documents:
for word, val in document.items():
if val > 0:
idfDict[word] += 1
for word, val in idfDict.items():
idfDict[word] = math.log(N / float(val))
return idfDict
def computeTFIDF(tfBagOfWords, idfs):
tfidf = {}
for word, val in tfBagOfWords.items():
tfidf[word] = val * idfs[word]
return tfidf
if __name__ == "__main__":
main()
I can compare calculation of Term Frequency. Both results look the same. But when I calculate the IDF and then TF-IDF there are differences between the code from the website and TfidfVectorizer (I also try combination of CountVectorizer and TfidfTransformer to be sure it returns the same results like TfidfVectorizer does).
Code Tf-Idf results:
TfidfVectorizer Tf-Idf results:
Can anybody help me with a code that would return the same returns as TfidfVectorizer or setting of TfidfVectorizer what would return the same results as the code above?
Here is my improvisation of your code to reproduce TfidfVectorizer output for your data .
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from IPython.display import display
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
corpus = [documentA, documentB]
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
print('----------- compare word count -------------------')
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
numOfWordsB[word] += 1
series_A = pd.Series(numOfWordsA)
series_B = pd.Series(numOfWordsB)
df = pd.concat([series_A, series_B], axis=1).T
df = df.reindex(sorted(df.columns), axis=1)
display(df)
tf_df = df.divide(df.sum(1),axis='index')
n_d = 1+ tf_df.shape[0]
df_d_t = 1 + (tf_df.values>0).sum(0)
idf = np.log(n_d/df_d_t) + 1
pd.DataFrame(df.values * idf,
columns=df.columns )
tfidf = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w*\\b', norm=None)
pd.DataFrame(tfidf.fit_transform(corpus).todense(),
columns=tfidf.get_feature_names() )
More details on the implementation refer the documentation here.

Goodness of fit always being zero despite taking random data?

I'm trying to write code that generates random data and computes goodness of fit but I'm not understanding why the chi-squared test is always zero, may I have a fix for this ? For an attempted fix I tried playing around with different types to see if I get any resulting changes in the initial output, also I've tried changing the parameters to the loop in question.
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
#===============================================================#
# !!!!!!! Generation of Data !!!!!!!!!! #
#===============================================================#
for i in range(0,12):
a = random.randint(-10,10)
b = random.randint(-10,10)
y = a * (b + i)
observed_values.append(y)
#######################################################################################
# !!! Array Setup !!!! #
# ***Had the Array types converted to floats before computing Chi2*** #
# #
#######################################################################################
t_s = 0
o_v = np.array(observed_values)
e_v = np.array(expected_values)
o_v_f = o_v.astype(float)
e_v_f = o_v.astype(float)
z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
######################################################################################
for i in z_o_e_v_f:
t_s += [((o_v_f)-(e_v_f))]**2/(e_v_f) # Computs the Chi2 Stat !
######################################################################################
print("Observed Values ", o_v_f)
print("Expected Values" , e_v_f)
df=len(o_v_f)-1
print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
return t_s
Linear_Chi2_Generate()
In your original code, e_v_f = o_v.astype(float) made o_v_f, e_v_f ending up the same. There was also some issue in the for loop. I have edited your code a bit. See what it does you are looking for:
from scipy import stats
import math
import random
import numpy
import scipy
import numpy as np
def Linear_Chi2_Generate(observed_values = [], expected_values = []):
#===============================================================#
# !!!!!!! Generation of Data !!!!!!!!!! #
#===============================================================#
for i in range(0,12):
a_o = random.randint(-10,10)
b_o = random.randint(-10,10)
y_o = a_o * (b_o + i)
observed_values.append(y_o)
# a_e = random.randint(-10,10)
# b_e = random.randint(-10,10)
# y_e = a_e * (b_e + i)
expected_values.append(y_o + 5)
#######################################################################################
# !!! Array Setup !!!! #
# ***Had the Array types converted to floats before computing Chi2*** #
# #
#######################################################################################
t_s = 0
o_v = np.array(observed_values)
e_v = np.array(expected_values)
o_v_f = o_v.astype(float)
e_v_f = e_v.astype(float)
z_o_e_v_f = zip(o_v.astype(float), e_v.astype(float))
######################################################################################
for o, e in z_o_e_v_f:
t_s += (o - e) **2 / e # Computs the Chi2 Stat !
######################################################################################
print("Observed Values ", o_v_f)
print("Expected Values" , e_v_f)
df=len(o_v_f)-1
print("Our goodness of fit for our linear function", stats.chi2.cdf(t_s,df))
return t_s
Linear_Chi2_Generate()

Having issues computing the average of compound sentiment values for each text file in a folder

# below is the sentiment analysis code written for sentence-level analysis
import glob
import os
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize
# Next, VADER is initialized so I can use it within the Python script
sid = SentimentIntensityAnalyzer()
# I will also initialize the 'english.pickle' function and give it a short
name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#Each of the text file is listed from the folder speeches
files = glob.glob(os.path.join(os.getcwd(), 'cnn_articles', '*.txt'))
text = []
#iterate over the list getting each file
for file in files:
#open the file and then call .read() to get the text
with open(file) as f:
text.append(f.read())
text_str = "\n".join(text)
# This breaks up the paragraph into a list of strings.
sentences = tokenizer.tokenize(text_str )
sent = 0.0
count = 0
# Iterating through the list of sentences and extracting the compound scores
for sentence in sentences:
count +=1
scores = sid.polarity_scores(sentence)
sent += scores['compound'] #Adding up the overall compound sentiment
# print(sent, file=open('cnn_compound.txt', 'a'))
if count != 0:
sent = float(sent / count)
print(sent, file=open('cnn_compound.txt', 'a'))
With these lines of code, I have been able to get the average of all the compound sentiment values for all the text files. What I really want is the
average compound sentiment value for each text file, such that if I have 10
text files in the folder, I will have 10 floating point values representing
each of the text file. So that I can plot these values against each other.
Kindly assist me as I am very new to Python.
# below is the sentiment analysis code written for sentence-level analysis
import os, string, glob, pandas as pd, numpy as np
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize
# Next, VADER is initialized so I can use it within the Python
script
sid = SentimentIntensityAnalyzer()
exclude = set(string.punctuation)
# I will also initialize the 'english.pickle' function and give
it a short
name
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#Each of the text file is listed from the folder speeches
files = glob.glob(os.path.join(os.getcwd(), 'cnn_articles',
'*.txt'))
text = []
sent = 0.0
count = 0
cnt = 0
#iterate over the list getting each file
for file in files:
f = open(file).read().split('.')
cnt +=1
count = (len(f))
for sentence in f:
if sentence not in exclude:
scores = sid.polarity_scores(sentence)
print(scores)
break
sent += scores['compound']
average = round((sent/count), 4)
t = [cnt, average]
text.append(t)
break
df = pd.DataFrame(text, columns=['Article Number', 'Average
Value'])
#
#df.to_csv(r'Result.txt', header=True, index=None, sep='"\t\"
+"\t\"', mode='w')
df.to_csv('cnn_result.csv', index=None)

Resources