audio buffer with librosa

In the following code I create a buffer which holds the last 10 frames of an audio file as the loop iterates.
import collections
import librosa
import wave

my_buffer = collections.deque(maxlen=10)
f = wave.open('Desktop/0963.wav', "rb")
num_frames = f.getnframes()
for frame in range(num_frames):
    my_buffer.append(f.readframes(1))  # read one frame per iteration
From the buffer, I need to get a numpy array representing the audio amplitude of each sample point, using librosa. Any ideas?
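One way to get an amplitude array from those raw bytes is numpy's frombuffer; a minimal sketch, assuming 16-bit mono PCM (librosa itself is not needed for this step):

import numpy as np

# each element of my_buffer is a bytes object from readframes();
# join them and reinterpret as 16-bit signed samples
raw = b''.join(my_buffer)
samples = np.frombuffer(raw, dtype=np.int16)
# optionally scale to float amplitudes in [-1, 1], as librosa does
amplitudes = samples.astype(np.float32) / 32768.0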

If you use scipy.io.wavfile, it will read a WAV file directly and load the data into a numpy array, which you can then slice as needed.
scipy.io.wavfile.read returns the sample rate (in samples/sec) and the data from the WAV file:
>>> type(f)
<class 'tuple'>
>>> f
(44100, array([-36, 57, 156, ..., 66, 64, 77], dtype=int16))
>>>
Source Code
from scipy.io.wavfile import read
import numpy as np

f = read('your_audio.wav')
n = np.array(f[1], dtype=float)
for i in range(0, len(n), 10):
    my_buffer = n[i:i+10]
my_buffer contents:
>>>
[ -36. 57. 156. 198. 191. 126. 70. 42. 43. 62.]
[ 69. 71. 83. 117. 159. 177. 151. 89. 14. -27.]
[ -33. -4. 21. 38. 42. 66. 94. 134. 144. 142.]
[ 118. 115. 111. 132. 122. 123. 103. 119. 125. 134.]
.....
.....
Here my_buffer holds 10 values per iteration, which you can feed into the next processing block.

As mentioned above, scipy.io.wavfile is a good module for reading in and handling audio. If you want to stick with librosa, you can use this to do the same:
import librosa

filepath = 'Desktop/0963.wav'
samplerate = 44100
# load the file and resample it to the requested 44100 Hz
audio, samplerate = librosa.load(filepath, sr=samplerate)
audio.shape
What I like about librosa.load is that you can specify any target sample rate to resample an audio file to. Hope this helps.
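Note that passing sr=None keeps the file's native sampling rate instead of resampling, e.g.:

import librosa

# sr=None preserves the file's native sample rate (no resampling)
audio, native_sr = librosa.load('Desktop/0963.wav', sr=None)
print(native_sr, audio.shape)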

Related

Numpy Array - Audio file

I am trying to analyze some audio files. I am using several helper functions that help me with data transformation tasks.
from typing import List, Tuple
import webrtcvad
import numpy.typing as npt
import numpy as np
import os
from scipy.io.wavfile import read, write
vad = webrtcvad.Vad()
vad.set_mode(2)
def _validate_array_dtype(array: npt.NDArray):
    if not isinstance(array, np.ndarray):
        raise ValueError(f"Chunk has to be type np.ndarray but is {type(array)}")
    if array.dtype != np.int16:
        raise ValueError(
            f"Chunk array has to be dtype np.int16 but is {array.dtype}")

def is_speech(frame: npt.NDArray, sample_rate: int) -> bool:
    """Detect if this audio segment is speech or not.

    Args:
        frame: Numpy array (dtype int16) of audio waveform. Duration needs to be exactly 10, 20 or 30 ms.
        sample_rate: Sample rate of the audio frame in Hz.

    Returns:
        Whether this frame contained speech.
    """
    if len(frame.shape) != 1:
        raise ValueError("Frame must be a 1D numpy array.")
    _validate_array_dtype(frame)
    samples_per_ms = sample_rate // 1000  # samples per millisecond
    allowed_lengths = {
        10 * samples_per_ms, 20 * samples_per_ms, 30 * samples_per_ms
    }
    if frame.shape[0] not in allowed_lengths:
        raise ValueError("Frame must have duration of either 10, 20 or 30 ms.")
    raw_frame = frame.tobytes()
    return vad.is_speech(raw_frame, sample_rate)

def read_wav(wav_path: str) -> Tuple[int, npt.NDArray]:
    """Read data from a .wav audio file into a numpy array.

    Args:
        wav_path: Path to the wav file to load.

    Returns:
        A tuple containing the sample rate of the audio file loaded and the raw audio data as a numpy array (dtype int16).
    """
    return read(wav_path)
I am struggling to understand how the is_speech function works.
When I call
is_speech(np.array([1,1,1,1,1,1,1,1,1,1], dtype=np.int16), sample_rate=10)
it returns the following error: ValueError: Frame must have duration of either 10, 20 or 30 ms.
I pass 10 elements to is_speech and it raises this error. I can see in the function definition where this ValueError is raised, but I fail to see how to use the function properly. Could you help me use this function? Many thanks in advance.
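For reference, here is a minimal sketch of a call that passes the length check, assuming the helpers above. Note that webrtcvad only supports sample rates of 8000, 16000, 32000 or 48000 Hz; with sample_rate=10, samples_per_ms is 10 // 1000 == 0, so no frame length can ever be valid.

import numpy as np

sample_rate = 8000                    # a rate webrtcvad actually supports
frame = np.zeros(80, dtype=np.int16)  # 80 samples = 10 ms at 8 kHz
print(is_speech(frame, sample_rate))  # expected False for pure silence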

How to process output into signed 16-bit big-endian integers

I need this code to output the data as signed 16-bit big-endian integers for a wave (WAV) file: what is the most efficient way to express that?
import numpy as np
import pyglet
import sox
# sample rate in Hz
sample_rate = 44100
# generate a 1-second sine tone at 440 Hz
y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)
print(y)  # prints the first and last values of array y (it's floating point)
# really need signed 16 bit big endian integers for a wave file
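One way to do this, offered as a sketch (the exact scaling factor is an assumption, not from the post): scale the floats into the signed 16-bit range and convert with numpy's big-endian dtype string '>i2'. One caveat: standard RIFF WAV files store PCM samples little-endian ('<i2'); big-endian 16-bit is what AIFF-style formats expect.

# scale [-1.0, 1.0] floats to the signed 16-bit range, then
# convert to big-endian ('>') signed 16-bit integers ('i2')
y_int16_be = (y * 32767).astype('>i2')
print(y_int16_be.dtype)  # dtype('>i2')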

MGCA technique for speech feature extraction shows this error (IndexError: list index out of range)

When executing this program for speech feature extraction from a WAV file, I got a problem in the code; the error says IndexError: list index out of range:
File "C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques/mgca.py", line 77, in <module>
    mel_Generalized()
File "C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques/mgca.py", line 74, in mel_Generalized
    mgca_feature_extraction(wav)
File "C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques/mgca.py", line 66, in mgca_feature_extraction
    writeFeatures(mgca_features, wav)
File "C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques/mgca.py", line 46, in writeFeatures
    wav = makeFormat(wav)
File "C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques/mgca.py", line 53, in makeFormat
    wav = wav.split('/')[1].split('-')[1]
IndexError: list index out of range

Process finished with exit code 1
#!usr/bin/python
from pysptk import *
from scipy import hamming
import numpy.matlib
import scipy
import scipy.io.wavfile as wav
import numpy as np
import wave
from python_speech_features.sigproc import *
from math import *
from six.moves import input as raw_input

def readWavFile(wav):
    #given a path from the keyboard to read a .wav file
    #wav = raw_input('Give me the path of the .wav file you want to read: ')
    inputWav = 'C:/Users/KALEEM/PycharmProjects/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques'+wav
    return inputWav

#reading the .wav file (signal file) and extracting the information we need
def initialize(inputWav):
    rate, signal = wav.read(readWavFile(inputWav)) # rate: sampling frequency; signal: numpy array of samples
    sig = wave.open(readWavFile(inputWav))
    # signal is the numpy array with the data of the .wav file
    # len(signal) is the number of samples
    sampwidth = sig.getsampwidth()
    print('The sample rate of the audio is: ', rate)
    print('Sampwidth: ', sampwidth)
    return signal, rate

#implementation of the pre-emphasis filter
def lowPassFilter(signal, coeff=0.97):
    #y[n] = x[n] - a*x[n-1], a = 0.97; despite the name, this is a pre-emphasis (high-pass) filter
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])

def preEmphasis(wav):
    #taking the signal
    signal, rate = initialize(wav)
    #Pre-emphasis Stage
    preEmphasis = 0.97
    emphasizedSignal = lowPassFilter(signal)
    Time = np.linspace(0, len(signal)/rate, num=len(signal))
    EmphasizedTime = np.linspace(0, len(emphasizedSignal)/rate, num=len(emphasizedSignal))
    return emphasizedSignal, signal, rate

def writeFeatures(mgca_features, wav):
    #write the output vectors of every sample to a txt file
    f = open('mel_generalized_features.txt', 'a')#sample ID
    #f = open('mfcc_featuresLR.txt','a')#only to initiate the input for the ROC curve
    wav = makeFormat(wav)
    np.savetxt(f, mgca_features, newline=",")
    f.write(wav)
    f.write('\n')

def makeFormat(wav):
    #if I want to keep only the gender (male, female)
    wav = wav.split('/')[1].split('-')[1]
    #only to make the format for Logistic Regression
    if (wav == 'Female'):
        wav = '1'
    else:
        wav = '0'
    return wav

def mgca_feature_extraction(wav):
    #I pre-emphasized the signal with the filter above
    emphasizedSignal, signal, rate = preEmphasis(wav)
    #and now I have the signal windowed
    emphasizedSignal *= np.hamming(len(emphasizedSignal))
    mgca_features = mgcep(emphasizedSignal, order=12)
    writeFeatures(mgca_features, wav)

def mel_Generalized():
    folder = raw_input('Give the name of the folder that you want to read data: ')
    amount = raw_input('Give the number of samples in the specific folder: ')
    for x in range(1, int(amount)+1):
        wav = '/'+folder+'/'+str(x)+'.wav'
        print(wav)
        mgca_feature_extraction(wav)

#def main():
mel_Generalized()
#main()
The problem is most likely due to unexpected input, which would be difficult for us to test.
More specifically, in the code below:
def makeFormat(wav):
    #if I want to keep only the gender (male, female)
    wav = wav.split('/')[1].split('-')[1]
    #only to make the format for Logistic Regression
    if (wav == 'Female'):
        wav = '1'
    else:
        wav = '0'
    return wav
I would assume that wav is a str-like object (or at least something that supports .split()). The result of split() is a list. If that list has fewer than two elements, trying to access its second element (with [1]) raises the IndexError: list index out of range you are getting.
In your case, wav does not contain enough '/' characters (at least 1), enough '-' characters (also at least 1), or both.
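A quick illustration of that failure mode with hypothetical inputs (the path needs at least one '/', and the segment after the first '/' needs at least one '-'):

wav = '/english-Female/3.wav'    # hypothetical path that works
wav.split('/')[1]                # 'english-Female'
wav.split('/')[1].split('-')[1]  # 'Female'

wav = '/samples/3.wav'           # folder name contains no '-'
wav.split('/')[1].split('-')     # ['samples'] -- only one element
wav.split('/')[1].split('-')[1]  # IndexError: list index out of range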

ValueError: operands could not be broadcast together with shapes (400,2) (400,)

I am extracting features from a WAV file using PLP (Python 3.6, Anaconda Spyder). After executing, I am facing an error on this line:
File "C:\ProgramData\Anaconda3\lib\site-packages\sidekit\frontend\features.py", line 399, in power_spectrum
    ahan = framed[start:stop, :] * window
ValueError: operands could not be broadcast together with shapes (400,2) (400,)
#!usr/bin/python
import numpy.matlib
import scipy
import wave  # needed by initialize() below
from scipy.fftpack.realtransforms import dct
from sidekit.frontend.vad import pre_emphasis
from sidekit.frontend.io import *
from sidekit.frontend.normfeat import *
from sidekit.frontend.features import *
import scipy.io.wavfile as wav
import numpy as np

def readWavFile(wav):
    #given a path from the keyboard to read a .wav file
    #wav = raw_input('Give me the path of the .wav file you want to read: ')
    inputWav = 'C:/Speech_Processing/2-Speech_Signal_Processing_and_Classification-master/feature_extraction_techniques'+wav
    return inputWav

#reading the .wav file (signal file) and extracting the information we need
def initialize(inputWav):
    rate, signal = wav.read(readWavFile(inputWav)) # rate: sampling frequency; signal: numpy array of samples
    sig = wave.open(readWavFile(inputWav))
    # signal is the numpy array with the data of the .wav file
    # len(signal) is the number of samples
    sampwidth = sig.getsampwidth()
    print('The sample rate of the audio is: ', rate)
    print('Sampwidth: ', sampwidth)
    return signal, rate

def PLP():
    folder = input('Give the name of the folder that you want to read data: ')
    amount = input('Give the number of samples in the specific folder: ')
    for x in range(1, int(amount)+1):
        wav = '/'+folder+'/'+str(x)+'.wav'
        print(wav)
        #inputWav = readWavFile(wav)
        signal, rate = initialize(wav)
        #returns PLP coefficients for every frame
        plp_features = plp(signal, rasta=True)
        meanFeatures(plp_features[0])

#compute the mean features for one .wav file (take the features for every frame and average them over the sample)
def meanFeatures(plp_features):
    #make a numpy array with length the number of plp features
    mean_features = np.zeros(len(plp_features[0]))
    #for one input, sum all frames for a specific feature and divide by the number of frames
    for x in range(len(plp_features)):
        for y in range(len(plp_features[x])):
            mean_features[y] += plp_features[x][y]
    mean_features = mean_features / len(plp_features)
    print(mean_features)

def main():
    PLP()

main()
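The shapes in the error message suggest a stereo file: framed[start:stop, :] has shape (400, 2) (two channels), while the analysis window has shape (400,). A common fix, offered here only as a sketch since the post does not confirm the channel layout, is to mix the signal down to mono before computing the PLP features:

# inside PLP(), after initialize(): collapse a stereo (n_samples, 2)
# array to mono (n_samples,) by averaging the two channels
if signal.ndim == 2:
    signal = signal.mean(axis=1)
plp_features = plp(signal, rasta=True)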

convert numpy array to AudioFileClip in MoviePy

I am trying to convert a numpy array of an audio file sampled at 44100 Hz into an AudioFileClip in MoviePy so I can overdub a VideoFileClip. The online documentation is unclear on this topic.
Any advice?
Thanks.
The relevant class is AudioArrayClip in AudioClip.py.
Here are a couple of examples of how to generate 2 seconds of mono and stereo random noise:
import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip
rate = 44100 # Sampling rate in samples per second.
duration = 2 # Duration in seconds
data_mono = np.random.uniform(-1, 1, (int(duration*rate/2), 1))  # the /2 is the duration workaround; see the edit below
data_stereo = np.random.uniform(-1, 1, (rate*duration, 2))
audio_mono = AudioArrayClip(data_mono, fps=rate)
audio_stereo = AudioArrayClip(data_stereo, fps=rate)
audio_mono.write_audiofile('mono.mp3')
audio_stereo.write_audiofile('stereo.mp3')
Edit: updated the workaround to get the correct duration of the mono file (Python 3.7.5, MoviePy 1.0.0).
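To actually overdub a video with one of these clips, MoviePy 1.x provides set_audio; a minimal sketch (the file names are hypothetical):

from moviepy.editor import VideoFileClip

video = VideoFileClip('input.mp4')     # hypothetical input video
video = video.set_audio(audio_stereo)  # returns a copy with the new audio track
video.write_videofile('overdubbed.mp4')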
