Feature Extraction using MFCC - python-3.x

I want to know how to extract features from an audio signal (x.wav) using MFCC. I know the steps of audio feature extraction with MFCC; what I am looking for is working Python code, ideally something I can use inside a Django project.

This is the most important step in building a speech recognizer: after converting the speech signal into the frequency domain, we must convert it into a usable feature-vector form.
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from python_speech_features import mfcc, logfbank

# Read the input audio file and keep the first 15,000 samples for analysis
frequency_sampling, audio_signal = wavfile.read("/home/user/Downloads/OSR_us_000_0010_8k.wav")
audio_signal = audio_signal[:15000]

# Extract the MFCC features
features_mfcc = mfcc(audio_signal, frequency_sampling)
print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
print('Length of each feature =', features_mfcc.shape[1])

# Plot the MFCC features (transpose so time runs along the x-axis)
features_mfcc = features_mfcc.T
plt.matshow(features_mfcc)
plt.title('MFCC')

# Extract and plot the log filter bank features
filterbank_features = logfbank(audio_signal, frequency_sampling)
print('\nFilter bank:\nNumber of windows =', filterbank_features.shape[0])
print('Length of each feature =', filterbank_features.shape[1])
filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')
plt.show()
Alternatively, you may use this code to extract the features:
import numpy as np
from sklearn import preprocessing
from python_speech_features import mfcc

def extract_features(audio, rate):
    """Extract 20-dim MFCC features from an audio signal, apply cepstral
    mean subtraction (CMS), and append deltas to get a 40-dim feature vector."""
    mfcc_feature = mfcc(audio, rate, 0.025, 0.01, 20, nfft=1200, appendEnergy=True)
    mfcc_feature = preprocessing.scale(mfcc_feature)
    delta = calculate_delta(mfcc_feature)
    combined = np.hstack((mfcc_feature, delta))
    return combined
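Note that calculate_delta is not defined in the snippet above. A minimal sketch of it, assuming the delta helper shipped with python_speech_features is acceptable (the 2-frame window is an illustrative choice):

from python_speech_features import delta as psf_delta

def calculate_delta(features):
    # First-order delta coefficients computed over a 2-frame window,
    # one row of deltas per MFCC frame
    return psf_delta(features, 2)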

You can also use the librosa package (it is easy to install and use) to extract MFCC features from an audio file:
import librosa
import librosa.display

audio_path = 'my_audio_file.wav'
x, sr = librosa.load(audio_path)
# Newer librosa versions require the signal as a keyword argument
mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40)
print(mfccs.shape)
You can also display the MFCCs using the following code:
librosa.display.specshow(mfccs, sr=sr, x_axis='time')
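For a standalone figure with a colorbar, a minimal sketch, assuming a recent librosa where specshow returns a mappable (the figure size and title are illustrative choices):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 4))
img = librosa.display.specshow(mfccs, sr=sr, x_axis='time', ax=ax)
fig.colorbar(img, ax=ax)  # specshow's return value feeds the colorbar
ax.set_title('MFCC')
plt.show()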

Related

Fastai for time series regression

I have been using the fastai library for a couple of years now. Recently, I came across its extension library dedicated to time series analysis - tsai.
I am trying to perform a simple regression task on the famous airpassengers dataset.
I have no idea what I am doing wrong:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import torch
import random
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# fastai
from fastai import *
from fastai.text import *
from fastai.text.all import *
from tsai.all import *

flight_data = sns.load_dataset("flights")
flight_data.head(20)

scaler = MinMaxScaler(feature_range=(-1, 1))
# flight_data['passengers'] = scaler.fit_transform(flight_data['passengers'].values.reshape(-1, 1)).flatten()

plt.figure(figsize=(10, 4))
plt.plot(flight_data['passengers'])

def create_inout_sequences(input_data, tw):
    inout_seq = []
    label_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = input_data[i:i+tw]
        train_label = input_data[i+tw:i+tw+1]
        inout_seq.append(train_seq)
        label_seq.append(train_label)
    return np.array(inout_seq), np.array(label_seq)

data = flight_data['passengers'].values
x, y = create_inout_sequences(data, 15)
src = itemify(x, y)
yy = y.reshape(-1)
xx = x.reshape(-1)

tfms = [None, [TSRegression()]]
batch_tfms = TSStandardize(by_sample=True, by_var=True)
dls = get_ts_dls(x, yy, tfms=tfms, bs=64)
dls.show_batch()
dls.one_batch()
dls.c
learn = ts_learner(dls, InceptionTime, metrics=[mae, rmse], cbs=ShowGraph())
learn.lr_find()
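For comparison, a minimal sketch of tsai's documented regression workflow (untested against this exact notebook; the valid_size, the reshape, and the training schedule are assumptions). tsai expects X shaped [samples x variables x timesteps], and the dataloaders are usually built with explicit train/validation splits:

splits = get_splits(yy, valid_size=0.2, stratify=False, shuffle=True)  # train/valid indices
X = x.reshape(x.shape[0], 1, x.shape[1])  # [samples x 1 variable x 15 timesteps]
tfms = [None, TSRegression()]
dls = get_ts_dls(X, yy, splits=splits, tfms=tfms,
                 batch_tfms=TSStandardize(by_sample=True), bs=64)
learn = ts_learner(dls, InceptionTime, metrics=[mae, rmse])
learn.fit_one_cycle(25, 1e-3)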

numpy data sounds different than original sound_file.wav file

import wave
import numpy as np
from IPython.display import display, Audio

# sound 1
with wave.open('sound_file.wav', 'rb') as wf:
    signal = np.frombuffer(wf.readframes(nframes=wf.getnframes()), 'int' + str(int(16 * wf.getsampwidth())))
    display(Audio(data=signal, rate=wf.getframerate()))

# sound 2
display(Audio('sound_file.wav'))
Here sound1 sounds different from sound2, so can anyone tell me what is happening there?
Also, please describe some usual sound-preprocessing practices that should be done after getting a NumPy array from a sound file.
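One thing worth checking (an observation about the snippet above, not a verified diagnosis): wave's getsampwidth() returns the sample width in bytes, so the dtype string should be built from 8 * sampwidth bits, not 16 * sampwidth. For 2-byte samples the code above produces int32 instead of int16, which would change the playback. A corrected sketch, with peak normalization as one common preprocessing step:

import wave
import numpy as np
from IPython.display import display, Audio

with wave.open('sound_file.wav', 'rb') as wf:
    dtype = 'int' + str(8 * wf.getsampwidth())  # e.g. int16 for 2-byte samples
    signal = np.frombuffer(wf.readframes(wf.getnframes()), dtype=dtype)
    rate = wf.getframerate()

# Common preprocessing: convert to floats scaled to [-1, 1] (peak normalization)
signal = signal.astype(np.float32) / np.max(np.abs(signal))
display(Audio(data=signal, rate=rate))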

How to iterate through audio files when converting into mfccs

I am a beginner. I am converting audio files into MFCCs; I have done it for one file but don't know how to iterate over the whole dataset. I have multiple folders in a Training folder, one of them is 001(0), from which one wav file is converted. I want to convert the wav files in all folders present in the Training folder.
import os
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import scipy.io.wavfile as wav
from python_speech_features import mfcc, logfbank
# Read the input audio file
(rate,sig) = wav.read('Downloads/DataVoices/Training/001(0)/001000.wav')
# Take the first 10,000 samples for analysis
sig = sig[:10000]
features_mfcc = mfcc(sig,rate)
# Print the parameters for MFCC
print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
print('Length of each feature =', features_mfcc.shape[1])
# Plot the features
features_mfcc = features_mfcc.T
plt.matshow(features_mfcc)
plt.title('MFCC')
# Extract the Filter Bank features
features_fb = logfbank(sig, rate)
# Print the parameters for Filter Bank
print('\nFilter bank:\nNumber of windows =', features_fb.shape[0])
print('Length of each feature =', features_fb.shape[1])
# Plot the features
features_fb = features_fb.T
plt.matshow(features_fb)
plt.title('Filter bank')
plt.show()
You can use glob recursively with wildcards to find all of the wav files. Note that since the code above uses from glob import glob, the function is called as glob(...) rather than glob.glob(...):
for f in glob(r'Downloads/DataVoices/Training/**/*.wav', recursive=True):
    (rate, sig) = wav.read(f)
    # Rest of your code
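A slightly fuller sketch of the loop, reusing the question's imports (storing the features in a dictionary keyed by path and truncating to 10,000 samples are illustrative choices, not requirements):

from glob import glob
import scipy.io.wavfile as wav
from python_speech_features import mfcc

all_features = {}
for f in glob(r'Downloads/DataVoices/Training/**/*.wav', recursive=True):
    rate, sig = wav.read(f)
    sig = sig[:10000]                  # same truncation as the single-file example
    all_features[f] = mfcc(sig, rate)  # one (num_windows x 13) array per file
print('Processed', len(all_features), 'files')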

Import PDF Image From MatPlotLib to ReportLab

I am trying to insert a saved PDF image into a ReportLab flowable.
I have seen several answers to similar questions and many involve using Py2PDF like this:
import PyPDF2
import PIL

input1 = PyPDF2.PdfFileReader(open(path + "image.pdf", "rb"))
page0 = input1.getPage(0)
xObject = page0['/Resources']['/XObject'].getObject()
for obj in xObject:
    # Do something here
    pass
The trouble I'm having is with a sample image I've saved from MatPlotLib as a PDF. When I try to access that saved image with the code above, it returns nothing under page0['/Resources']['/XObject'].
In fact, here's what I see when I look at page0 and /XObject:
'/XObject': {}
Here's the code I used to generate the PDF:
import matplotlib.pyplot as plt
import numpy as np
# Fixing random state for reproducibility
np.random.seed(19680801)
plt.rcdefaults()
fig, ax = plt.subplots()
# Example data
people = ('Tom', 'Dick', 'Harry', 'Slim', 'Jim')
y_pos = np.arange(len(people))
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))
ax.barh(y_pos, performance, xerr=error, align='center',
        color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.invert_yaxis() # labels read top-to-bottom
ax.set_xlabel('Performance')
ax.set_title('How fast do you want to go today?')
plt.savefig(path+'image.pdf',bbox_inches='tight')
Thanks in advance!
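One workaround worth noting: matplotlib's PDF backend typically draws charts as vector operations directly in the page's content stream, so there is often no image XObject to extract. If a raster image is acceptable, a minimal sketch that saves the figure as a PNG and places it into a ReportLab story (the output name and dimensions are illustrative assumptions):

import io
from reportlab.platypus import SimpleDocTemplate, Image

# Render the existing matplotlib figure to an in-memory PNG
buf = io.BytesIO()
fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
buf.seek(0)

# Place the raster image into a ReportLab flowable document
doc = SimpleDocTemplate('report.pdf')
doc.build([Image(buf, width=400, height=267)])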

Why is the plot in librosa different?

I am currently trying to use librosa to perform an STFT, such that the parameters resemble the STFT process of a different framework (Kaldi).
The audio file is fash-b-an251.
Kaldi does it using a sample frequency of 16 kHz, window_size = 400 (25 ms), hop_length = 160 (10 ms).
The spectrogram extracted from this looks like this:
I then tried to do the same using librosa:
import numpy as np
import sys
import librosa
import librosa.display
import os
import matplotlib.pyplot as plt
from matplotlib import cm

# Input parameter: relative path to the audio file
if len(sys.argv) < 2:
    print("Missing Arguments!")
    print("python spectogram_librosa.py path_to_audio_file")
    sys.exit()

path = sys.argv[1]
abs_path = os.path.abspath(path)
spectogram_dnn = "/home/user/dnn/spectogram"
if not os.path.exists(spectogram_dnn):
    print("spectogram_dnn folder didn't exist!")
    os.makedirs(spectogram_dnn)
    print("Created!")

y, sr = librosa.load(abs_path, sr=16000)
# librosa.logamplitude was removed in newer versions; amplitude_to_db replaces it
D = librosa.amplitude_to_db(np.abs(librosa.stft(y, win_length=400, hop_length=160,
                                                window='hann', center=False)), ref=np.max)
librosa.display.specshow(D, sr=16000, hop_length=160, x_axis='time', y_axis='log', cmap=cm.jet)
plt.colorbar(format='%+2.0f dB')
plt.title('Log power spectrogram')
plt.show()
input()
sys.exit()
This is basically taken from here, in which I've modified the STFT call so that it fits my parameters.
The problem is that it creates an entirely different plot.
So, what am I doing wrong in librosa? Why is this plot so different from the one created in Kaldi?
Am I missing something?
It has to do with the Hz scale. The frequency axis in the first image is linear while the one in the second image is logarithmic. You can fix it by changing the scale in one of the images to match the other.
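For example, to give the librosa plot the same linear frequency axis as the Kaldi one, change the y_axis argument in the specshow call above:

librosa.display.specshow(D, sr=16000, hop_length=160,
                         x_axis='time', y_axis='linear', cmap=cm.jet)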
