How can I transcribe audio in real time with PyAudio and PyTorch?

I'm having a hard time implementing real-time audio transcription from a microphone using the pyaudio library. I've developed my own speech recognition model with PyTorch. The model gives wrong predictions most of the time. This is the sample code:
import torch
import pyaudio as pa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC,AutoTokenizer
# Pretrained wav2vec2 checkpoint shared by the processor and the CTC model.
# (The original processor name contained a stray space: "wav2vec2- base-960h".)
MODEL_NAME = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Microphone capture settings used by record_microphone().
frames_per_buffer = 3200
# pyaudio is imported as ``pa`` above, so the bare name ``pyaudio`` is undefined.
format = pa.paInt16
channel = 1
rate = 16000
p = pa.PyAudio()
def record_microphone(seconds=1):
    """Record ``seconds`` of audio from the default input device.

    Opens an input stream on the module-level PyAudio instance ``p`` using the
    module-level ``rate``, ``channel``, ``format`` and ``frames_per_buffer``
    settings, reads whole buffers, and always stops and closes the stream.

    Args:
        seconds: Recording length in seconds (default 1, as before).

    Returns:
        A 1-D ``np.int16`` array holding the recorded samples.
    """
    stream = p.open(rate=rate, channels=channel, format=format,
                    input=True, frames_per_buffer=frames_per_buffer)
    try:
        # Number of whole buffers that fit into the requested duration.
        n_reads = int(rate / frames_per_buffer * seconds)
        frames = [stream.read(frames_per_buffer) for _ in range(n_reads)]
    finally:
        # Release the device even if a read fails (e.g. input overflow).
        stream.stop_stream()
        stream.close()
    return np.frombuffer(b''.join(frames), dtype=np.int16)
def terminate():
    """Shut down the module-level PyAudio instance ``p``."""
    p.terminate()
def predict():
    """Record one clip from the microphone, transcribe it, print and return it.

    Returns:
        The decoded transcription of the first (only) batch item, as returned
        by ``processor.batch_decode`` (the lower-cased form is what is printed).
    """
    audio = record_microphone()
    # Hand the processor a float32 waveform scaled to [-1, 1) instead of pushing
    # raw int16 samples through torch.FloatTensor.
    waveform = audio.astype(np.float32) / 32768.0
    input_values = processor(waveform, sampling_rate=16_000,
                             return_tensors="pt", padding="longest").input_values
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits
    pred = torch.argmax(logits, dim=-1)
    pred_word = processor.batch_decode(pred)[0]
    print(pred_word.lower() + " ", end="")
    return pred_word
# Compare the module's __name__ variable, not the string literal "__name__";
# the original condition was always False, so nothing ever ran.
if __name__ == "__main__":
    try:
        predict()
    finally:
        # Always release PortAudio, even if recording or inference fails.
        terminate()

Related

Using pyaudio and Speech Recognition at the same time

I want to record audio to a file while using Speech Recognition at the same time. For some reason my program always crashes after a few moments, and it never gets as far as creating the audio file.
I suspect the problem is related to using threads, as both parts worked fine on their own. Unfortunately, I could not find anything about it.
Does anyone have an idea how I can fix this, or how I can use Speech Recognition while recording sound?
import threading
import speech_recognition as sr
import pyaudio
import wave
import time
status = True
def record():
    """Record microphone audio until the global ``status`` becomes falsy.

    Reads 1024-frame chunks of 16-bit mono audio at 44.1 kHz and, once the loop
    ends, writes everything captured to ``output.wav``.
    """
    chunk = 1024
    sample_format = pyaudio.paInt16
    channels = 1
    fs = 44100
    filename = 'output.wav'
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(sample_format)
        print('Recording')
        stream = p.open(format=sample_format,
                        channels=channels,
                        rate=fs,
                        frames_per_buffer=chunk,
                        input=True)
        frames = []
        try:
            # ``status`` is only read here, so no ``global`` declaration is needed.
            while status:
                frames.append(stream.read(chunk))
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()
    print('Finished recording')
    # The context manager closes the file even if writing fails.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))
def get_audio():
    """Listen on the default microphone forever and print each recognized phrase.

    Each phrase captured by ``Recognizer.listen`` is sent to the Google
    recognizer with language ``de_DE``. Failures are reported and the loop
    continues.
    """
    # One recognizer is enough; it does not need to be rebuilt for every phrase.
    r = sr.Recognizer()
    while True:
        with sr.Microphone() as source:
            print("Höre zu ...")
            audio = r.listen(source)
        try:
            said = r.recognize_google(audio, language="de_DE")
            print(said)
        except sr.UnknownValueError:
            # Speech was unintelligible: keep the original blank-line output.
            print('')
        except Exception as e:
            # Still best-effort, but say what went wrong instead of hiding it.
            print(f'Speech recognition failed: {e}')
# Run the recorder and the recognizer in parallel threads.
thread1 = threading.Thread(target=record)
thread2 = threading.Thread(target=get_audio)
thread1.start()
thread2.start()

# Let both run for five seconds, then clear the flag that record() polls.
time.sleep(5)
status = False
You can record and save the sound with Speech Recognition itself. Just use this version of the function and it will create a speech.wav file:
def get_audio():
    """Listen forever, save each captured phrase to ``speech.wav`` and print its text.

    ``speech.wav`` is opened in ``'wb'`` mode for every phrase, so it always
    holds only the most recent one. Recognition uses the Google recognizer with
    language ``de_DE``; failures are reported and the loop continues.
    """
    # One recognizer is enough; it does not need to be rebuilt for every phrase.
    r = sr.Recognizer()
    while True:
        with sr.Microphone() as source:
            print("Höre zu ...")
            audio = r.listen(source)
        # Persist the raw recording before attempting recognition.
        with open('speech.wav', 'wb') as f:
            f.write(audio.get_wav_data())
        try:
            said = r.recognize_google(audio, language="de_DE")
            print(said)
        except sr.UnknownValueError:
            # Speech was unintelligible: keep the original blank-line output.
            print('')
        except Exception as e:
            # Still best-effort, but say what went wrong instead of hiding it.
            print(f'Speech recognition failed: {e}')

How do you parse the bin file from INT8 Calibration of TensorRT?

I have created a Python script that calibrates (INT8) the dynamic ranges of the activations of TinyYOLO V2 using TensorRT. The script produced a file called calibration_cache.bin. How do I parse the .bin file? What do the values inside it mean?
calibrator.py
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from PIL import Image
import ctypes
import tensorrt as trt
import os
CHANNEL = 3
HEIGHT = 416
WIDTH = 416
class PythonEntropyCalibrator(trt.IInt8EntropyCalibrator):
    """Entropy calibrator that feeds batches from a batch stream to TensorRT.

    The calibration cache is read from and written to ``calibration_cache.bin``
    in the current working directory.
    """

    def __init__(self, input_layers, stream):
        """Store the inputs and allocate a device buffer for one batch.

        Args:
            input_layers: Sequence whose first element is used in ``get_batch``.
            stream: Batch source exposing ``calibration_data``, ``batch_size``,
                ``next_batch()`` and ``reset()``.
        """
        trt.IInt8EntropyCalibrator.__init__(self)
        self.input_layers = input_layers
        self.stream = stream
        # Device buffer sized to the stream's calibration array.
        self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes)
        stream.reset()

    def get_batch_size(self):
        """Return the batch size of the underlying stream."""
        return self.stream.batch_size

    def get_batch(self, bindings, names):
        """Copy the next batch to the device and return the updated bindings.

        Returns ``None`` once the stream yields an empty batch.
        """
        batch = self.stream.next_batch()
        if batch.size == 0:
            return None
        cuda.memcpy_htod(self.d_input, batch)
        # NOTE(review): this walks the elements of input_layers[0] (single
        # characters when it is a string) and asserts names[0] differs from
        # each one. Kept as in the original; confirm what was intended.
        for element in self.input_layers[0]:
            assert names[0] != element
        bindings[0] = int(self.d_input)
        return bindings

    def read_calibration_cache(self, length=0):
        """Return the bytes of ``calibration_cache.bin`` if it exists, else ``None``."""
        if not os.path.exists('calibration_cache.bin'):
            return None
        with open('calibration_cache.bin', 'rb') as cache_file:
            return cache_file.read()

    def write_calibration_cache(self, cache, size=0):
        """Write ``cache`` to ``calibration_cache.bin``; returns ``None``."""
        with open('calibration_cache.bin', 'wb') as cache_file:
            cache_file.write(cache)
        return None
class ImageBatchStream():
    """Serve calibration images in fixed-size batches of CHW float32 arrays."""

    def __init__(self, batch_size, calibration_files, preprocessor):
        """Set up the stream.

        Args:
            batch_size: Number of images per batch.
            calibration_files: List of image paths to read.
            preprocessor: Callable applied to each CHW image array.
        """
        self.batch_size = batch_size
        # Ceiling division: a trailing partial batch counts as one more batch.
        self.max_batches = (len(calibration_files) // batch_size) + \
                           (1 if (len(calibration_files) % batch_size)
                            else 0)
        self.files = calibration_files
        # Buffer reused for every batch.
        self.calibration_data = np.zeros((batch_size, CHANNEL, HEIGHT, WIDTH),
                                         dtype=np.float32)
        self.batch = 0
        self.preprocessor = preprocessor

    # The decorator had been reduced to the comment "#staticmethod"; restore it
    # so the function has no implicit ``self`` when called on an instance.
    @staticmethod
    def read_image_chw(path):
        """Load ``path``, resize to (WIDTH, HEIGHT) and return a float32 CHW array."""
        img = Image.open(path).resize((WIDTH, HEIGHT), Image.NEAREST)
        im = np.array(img, dtype=np.float32, order='C')
        # Reverse the last (channel) axis; presumably RGB -> BGR, confirm.
        im = im[:, :, ::-1]
        # Move the channel axis first: HWC -> CHW.
        im = im.transpose((2, 0, 1))
        return im

    def reset(self):
        """Rewind the stream to the first batch."""
        self.batch = 0

    def next_batch(self):
        """Return the next batch, or an empty array when all batches are used.

        NOTE(review): when the final batch holds fewer than ``batch_size``
        files, the remaining rows of the reused buffer keep the previous
        batch's images.
        """
        if self.batch >= self.max_batches:
            return np.array([])
        start = self.batch_size * self.batch
        files_for_batch = self.files[start:start + self.batch_size]
        imgs = []
        for f in files_for_batch:
            print("[ImageBatchStream] Processing ", f)
            img = ImageBatchStream.read_image_chw(f)
            imgs.append(self.preprocessor(img))
        for i, img in enumerate(imgs):
            self.calibration_data[i] = img
        self.batch += 1
        return np.ascontiguousarray(self.calibration_data, dtype=np.float32)
test.py
from random import shuffle
from PIL import Image
import glob
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import os
from calibrator import *
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
model_file = './tiny_yolov2/Model.onnx'
dataset_loc = './Dataset/*.jpg'
def normalize(data):
    """Divide ``data`` by 255.0 using in-place division and return the result."""
    data /= 255.0
    return data
def create_calibration_dataset():
    """Return at most 20 randomly ordered paths matching ``dataset_loc``."""
    candidates = glob.glob(dataset_loc)
    # Shuffle in place so the first 20 entries form a random sample.
    shuffle(candidates)
    return candidates[:20]
# Collect the calibration images and wrap them in a batch stream.
calibration_files = create_calibration_dataset()
NUM_IMAGES_PER_BATCH = 5
batchstream = ImageBatchStream(NUM_IMAGES_PER_BATCH, calibration_files, normalize)
Int8_calibrator = PythonEntropyCalibrator(["conv2d_91_input"], batchstream)

# Configure the builder for INT8 calibration and a refittable engine.
builder = trt.Builder(TRT_LOGGER)
builder.int8_calibrator = Int8_calibrator
builder.refittable = True
builder.int8_mode = True
network = builder.create_network()
parser = trt.OnnxParser(network, TRT_LOGGER)
print(builder.int8_mode, builder.platform_has_fast_int8, builder.refittable)

with open(model_file, 'rb') as model:
    # parse() reports failure through its return value, so check it instead of
    # carrying on with a half-built network.
    if not parser.parse(model.read()):
        for idx in range(parser.num_errors):
            print(parser.get_error(idx))
        raise RuntimeError('Failed to parse ONNX file: ' + model_file)
print('Done reading ONNX File\n')

engine = builder.build_cuda_engine(network)
print(engine, TRT_LOGGER)
# Without this check a failed build surfaces as an AttributeError on None below.
if engine is None:
    raise RuntimeError('TensorRT engine build failed')
with open("model.trt", "wb") as f:
    f.write(engine.serialize())
print("Done converting the ONNX to TRT\n")

# Inspect the per-tensor dynamic ranges through the refitter.
tinyolo_fitter = trt.Refitter(engine, TRT_LOGGER)
print(tinyolo_fitter.refit_cuda_engine())
print(tinyolo_fitter.get_tensors_with_dynamic_range())
calibration_cache.bin
TRT-5105-EntropyCalibration
image: 3c010a14
scalerPreprocessor_scaled: 38018ba0
image2: 38018ba0
batchnormalization_1_output: 3d07b31d
leakyrelu_1_output: 3c98a317
maxpooling2d_1_output: 3c1e5b30
batchnormalization_2_output: 3ca6aa67
leakyrelu_2_output: 3ca6aa67
maxpooling2d_2_output: 3c82cf7d
batchnormalization_3_output: 3ce07ce8
leakyrelu_3_output: 3ce52236
maxpooling2d_3_output: 3cc8ed6f
batchnormalization_4_output: 3d3df55f
leakyrelu_4_output: 3c651727
maxpooling2d_4_output: 3cec84fc
batchnormalization_5_output: 3d0f51e3
leakyrelu_5_output: 3cb52377
maxpooling2d_5_output: 3d026049
batchnormalization_6_output: 3d387291
leakyrelu_6_output: 3ccc009a
maxpooling2d_6_output: 3c8d0f0c
batchnormalization_7_output: 3e0de3d2
leakyrelu_7_output: 3d7b4397
batchnormalization_8_output: 3cc459d6
leakyrelu_8_output: 3cbd9562
grid: 3ddc32dc
def read_calibration_cache(self, length=0):
    """Return the bytes of ``calibration_cache.bin`` if it exists, else ``None``."""
    if not os.path.exists('calibration_cache.bin'):
        return None
    with open('calibration_cache.bin', 'rb') as cache_file:
        return cache_file.read()
This does the work, I guess. If there is a calibration_cache.bin file in your working directory, the calibrator reads it instead of calibrating again.

pyaudio callback called only once

I am trying to use pyaudio with the callback option, and I want to yield the data from a generator instead of reading it from a file. When I use the callback option, the callback gets called only once.
There is another question about the same problem, but it doesn't have an answer. I have made a minimal reproducible example. The code works when blocking mode is used.
import time
import numpy as np
import scipy.signal
import sounddevice as sd
import pyaudio
# One-second linear chirp from f0 to f1 Hz, scaled to the int16 range.
sample_rate = 44100
max_amp = 2**15 - 1
f0 = 500
duration = 1
f1 = 3000
x = np.arange(0, duration, 1/sample_rate)
y_float = max_amp*scipy.signal.chirp(x, f0, duration, f1)
y = y_float.astype(np.int16)
# ndarray.tostring() is a deprecated alias that newer NumPy releases drop;
# tobytes() returns the same raw bytes.
data = y.tobytes()
def create_data_generator(data, periodsize=1000):
    """Yield ``data`` in consecutive slices of ``periodsize`` items.

    The final slice may be shorter. The original loop dropped it whenever
    ``len(data)`` was not a multiple of the period size.

    Args:
        data: Sliceable sequence (here: the raw audio bytes).
        periodsize: Slice length; defaults to the previously hard-coded 1000.
    """
    for start in range(0, len(data), periodsize):
        yield data[start:start + periodsize]
data_generator = create_data_generator(data)

# Bytes pulled from the generator but not yet handed to PortAudio.
_pending = bytearray()
# The output stream is opened as paInt16 with one channel: 2 bytes per frame.
_BYTES_PER_FRAME = 2


def callback(in_data, frame_count, time_info, status):
    """PyAudio output callback: supply exactly ``frame_count`` frames per call.

    PyAudio treats a returned buffer shorter than ``frame_count`` frames as the
    end of the stream. Returning fixed 1000-byte chunks is why the original
    callback ran only once. This version buffers generator output until a full
    block is available and signals ``paComplete`` when the data runs out.

    Returns:
        ``(bytes, flag)`` where flag is ``pyaudio.paContinue`` while a full
        block could be supplied, otherwise ``pyaudio.paComplete``.
    """
    needed = frame_count * _BYTES_PER_FRAME
    while len(_pending) < needed:
        chunk = next(data_generator, None)
        if chunk is None:
            # Generator exhausted: hand out whatever is left.
            break
        _pending.extend(chunk)
    out = bytes(_pending[:needed])
    del _pending[:needed]
    flag = pyaudio.paContinue if len(out) == needed else pyaudio.paComplete
    return (out, flag)
# -------- blocking------------------
# periodsize = 1000
# p = pyaudio.PyAudio()
# stream = p.open(format=pyaudio.paInt16,
# channels=1,
# rate=sample_rate,
# output=True)
# start = time.time()
# for i in range(int(len(data)/(periodsize))):
# chunk, status = callback(0, 0, 0, 0)
# stream.write(chunk)
# time.sleep(duration-(time.time()-start))
# stream.stop_stream()
# stream.close()
# p.terminate()
# -------- callback ------------------
periodsize = 1000
p = pyaudio.PyAudio()
try:
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=sample_rate,
                    output=True,
                    stream_callback=callback)
    try:
        # start the stream (4)
        stream.start_stream()
        # wait for stream to finish (5)
        while stream.is_active():
            time.sleep(0.1)
    finally:
        # stop stream (6); also runs if the wait loop is interrupted
        stream.stop_stream()
        stream.close()
finally:
    # close PyAudio (7)
    p.terminate()

I am unable to simultaneously stream the feed from my RBP3's camera and record to a file at the same time using python

I know how to save to a file using the code below (and timestamp the feed), and I know how to stream using uv4l, but I can't work out how to do both simultaneously.
import time
time.sleep(60)
import picamera
import datetime as dt
camera = picamera.PiCamera()
try:
    camera.resolution = (640, 480)
    #camera.vflip = True
    camera.led = False
    x = 0
    # Record forever, starting a new file named after the current hour each pass.
    while True:
        videoname = "/media/pi/cam/" + dt.datetime.now().strftime('%Y-%m-%d-%H') + ".h264"
        camera.annotate_background = picamera.Color('black')
        camera.annotate_text = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        #camera.start_preview()
        camera.start_recording(videoname)
        start = dt.datetime.now()
        # total_seconds() is the full elapsed time; .seconds is only the
        # seconds component of the timedelta.
        while (dt.datetime.now() - start).total_seconds() < 3600:
            # Refresh the on-screen timestamp about five times per second.
            camera.annotate_text = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            camera.wait_recording(0.2)
        camera.stop_recording()
        x = x+1
finally:
    # Release the camera if the loop is interrupted or recording fails.
    camera.close()
I imagine I would use Flask to create a local website to stream the feed to.
I have looked all over the internet, and this example by Dave Jones seems to be the closest solution, but I don't know whether a socket can communicate with a browser:
https://raspberrypi.stackexchange.com/questions/27041/record-and-stream-video-from-camera-simultaneously
There is also this code, which streams the camera feed to a page, but it does not mention how to record simultaneously as well:
from flask import Flask, render_template, Response
# Raspberry Pi camera module (requires picamera package, developed by Miguel Grinberg)
from camera_pi import Camera
app = Flask(__name__)
# The route decorator had been reduced to the comment "#app.route('/')",
# leaving the view unregistered; restore it.
@app.route('/')
def index():
    """Video streaming home page."""
    return render_template('index.html')
def gen(camera):
    """Endlessly yield multipart chunks, one JPEG frame from ``camera`` each."""
    part_header = b'--frame\r\n' b'Content-Type: image/jpeg\r\n\r\n'
    while True:
        yield part_header + camera.get_frame() + b'\r\n'
# The route decorator had been reduced to the comment "#app.route('/video_feed')",
# leaving the view unregistered; restore it.
@app.route('/video_feed')
def video_feed():
    """Video streaming route. Put this in the src attribute of an img tag."""
    return Response(gen(Camera()),
                    mimetype='multipart/x-mixed-replace; boundary=frame')
if __name__ == '__main__':
    # Serve on all interfaces, port 80, with one thread per request.
    # NOTE(review): debug=True together with host 0.0.0.0 exposes the debugger
    # to the network; confirm this is only used on a trusted LAN.
    app.run(host='0.0.0.0', port=80, debug=True, threaded=True)
Or maybe this is all wrong and there is simpler solution to this?
Thanks for the help.

Continuously streaming an audio signal in real time, indefinitely, in Python

I have a simple question: while streaming an audio signal from the audio jack in Python using the pyaudio library, how can I keep streaming the signal until I choose to stop the program?
Example: the way we capture webcam frames indefinitely inside an infinite while loop.
For example, in this code (taken from the link), which records the stream for just 5 seconds, what modification would serve my purpose?
import pyaudio
import wave
import numpy as np
CHUNK = 44100
FORMAT = pyaudio.paInt32
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "output.wav"
p = pyaudio.PyAudio()
try:
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    print("* recording")
    frames = []
    try:
        # One read per CHUNK frames until RECORD_SECONDS have been captured.
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            # np.fromstring on binary data is deprecated; frombuffer is the
            # replacement and avoids copying.
            audio_data = np.frombuffer(data, dtype=np.int32)
            print(data)
            print(audio_data)
            frames.append(data)
        print("* done recording")
    finally:
        # Release the device even if a read fails.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()
Also the code given on this link (Handling audio data using callback mode) records it for 4-5 seconds.
I will be really grateful if someone could help me with this!!
Well, in the meantime I figured out a solution:
import pyaudio
import numpy as np
import pylab
import time
import sys
import matplotlib.pyplot as plt
RATE = 44100
CHUNK = int(RATE/20) # RATE / number of updates per second
def soundplot(stream):
    """Read one ``CHUNK`` of frames from ``stream`` and print it as int16 samples."""
    # np.fromstring on binary data is deprecated; frombuffer is the replacement.
    data = np.frombuffer(stream.read(CHUNK), dtype=np.int16)
    print(data)
if __name__ == "__main__":
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True,
                    frames_per_buffer=CHUNK)
    try:
        # Stream until the user presses Ctrl-C. This replaces the
        # range(sys.maxsize**10) stand-in for "forever".
        while True:
            soundplot(stream)
    except KeyboardInterrupt:
        pass
    finally:
        # Previously unreachable in practice; now always runs on exit.
        stream.stop_stream()
        stream.close()
        p.terminate()
And this post here will help you in simple and concrete way
Hello, this is my code, which records audio and video separately and lets you pause both. I hope it helps you.
import cv2
import numpy as np
from datetime import datetime
import gtk
import keyboard
import pyaudio
import wave
import sys
flagrecord=True
#chunk = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
ropen=True
chunk = int(RATE/20)
def show_webcam(flagrecord):
    """Show the webcam feed while recording video and microphone audio.

    Keys: ``p`` pauses, ``c`` continues, ``q`` or Esc quits. Video goes to a
    timestamped ``.avi`` file. Audio is written on exit as wave data to a file
    named with the same timestamp.

    Args:
        flagrecord: True to start in recording mode, False to start paused.
    """
    cam = cv2.VideoCapture(0)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    frame_width = int(cam.get(3))
    frame_height = int(cam.get(4))
    FONT = cv2.FONT_HERSHEY_PLAIN
    # Take one timestamp so the audio and video names cannot differ by a second.
    stamp = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    filename = stamp + ".avi"
    # NOTE(review): the audio file name has no ".wav" extension in the
    # original; kept unchanged, confirm whether that is intended.
    filenamea = stamp
    p = pyaudio.PyAudio()
    # Query the sample width while the PyAudio instance is still alive.
    sample_width = p.get_sample_size(FORMAT)
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=chunk)
    out = cv2.VideoWriter(filename, fourcc, 20, (frame_width, frame_height))
    recorded = []   # audio chunks committed at each pause (was named ``all``)
    aux = []        # audio chunks captured since the last pause
    stream.start_stream()
    flagaudio = False
    while True:
        ret_val, img = cam.read()
        if not ret_val:
            # No frame delivered; stop instead of failing inside cv2.flip(None).
            break
        title = datetime.now().strftime("%Y-%m-%d*%H:%M:%S")
        img = cv2.flip(img, 1)
        if flagrecord:
            cv2.putText(img, "REC", (40, 40), FONT, 3, (0, 0, 255), 3)
            cv2.circle(img, (20, 20), 10, (0, 0, 255), -1)
            cv2.rectangle(img, (30, 430), (600, 480), (0, 0, 0), -1)
            cv2.putText(img, title, (40, 470), FONT, 3, (255, 255, 255), 2)
            cv2.imshow('Grabacion de Audiencias', img)
            aux.append(stream.read(chunk))
            out.write(img)
        else:
            cv2.putText(img, "PAUSE", (40, 40), FONT, 3, (255, 0, 0), 3)
            cv2.circle(img, (20, 20), 10, (255, 0, 0), -1)
            cv2.rectangle(img, (50, 430), (570, 480), (0, 0, 0), -1)
            cv2.putText(img, "Audiencias En Pausa", (60, 470), FONT, 3, (255, 0, 0), 2)
            cv2.imshow('Grabacion de Audiencias', img)
            if flagaudio:
                # Commit the audio captured so far and stop the input stream
                # while paused.
                recorded += aux
                del aux[:]
                stream.stop_stream()
        q = cv2.waitKey(1)
        if q == 27 or q == ord('q'):
            break
        if q == ord('p'):
            flagrecord = False
            flagaudio = True
        if q == ord('c'):
            flagrecord = True
            flagaudio = False
            stream.start_stream()
    cam.release()
    out.release()
    cv2.destroyAllWindows()
    stream.close()
    p.terminate()
    recorded += aux
    # stream.read() returns bytes, so the chunks must be joined with a bytes
    # separator; ''.join raises TypeError on Python 3.
    data = b''.join(recorded)
    with wave.open(filenamea, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(data)
def main():
    """Start the recorder in the mode given by the module-level ``flagrecord``."""
    # show_webcam() has no ``mirror`` parameter, so the original call
    # show_webcam(mirror=True) raised TypeError.
    show_webcam(flagrecord)


if __name__ == '__main__':
    main()

Resources