Google speech to text live stream, single_utterance is not working - node.js

I'm trying to live stream speech to text using Google. I have installed Node on my server.
I have successfully implemented it, but I want Google to recognize when the user stops speaking. Google explains how to do that with single_utterance=true, but it is not taking effect. Can you please tell me what the issue is in the code below? Thank you!
var request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    //profanityFilter: false,
    enableWordTimeOffsets: true,
    //single_utterance: true
    // speechContexts: [{
    //   phrases: ["hoful","shwazil"]
    // }] // add your own speech context for better recognition
  },
  interimResults: true, // If you want interim results, set this to true
  singleUtterance: true
};
function startRecognitionStream(client, data) {
  console.log(request);
  recognizeStream = speechClient.streamingRecognize(request)
    .on('error', console.error)
    .on('data', (data) => {
      process.stdout.write(
        (data.results[0] && data.results[0].alternatives[0])
          ? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
          : `\n\nReached transcription time limit, press Ctrl+C\n`);
      client.emit('speechData', data);

      // if end of utterance, let's restart stream
      // this is a small hack. After 65 seconds of silence, the stream will still throw an error for speech length limit
      if (data.results[0] && data.results[0].isFinal) {
        stopRecognitionStream();
        startRecognitionStream(client);
        // console.log('restarted stream serverside');
      }
    })
    .on('end_of_single_utterance', (data) => {
      process.stdout.write('data ended');
      console.log('data ended');
    });
}
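For completeness, this is how I expected the end-of-utterance signal to surface, going by the StreamingRecognizeResponse reference: not as its own stream event, but as a speechEventType value on a regular 'data' response. This is my assumption, and it is exactly what I can't get to fire:
recognizeStream = speechClient.streamingRecognize(request)
  .on('error', console.error)
  .on('data', (data) => {
    // My assumption (from the StreamingRecognizeResponse reference): with
    // singleUtterance: true, the end of speech arrives as a normal 'data'
    // response whose speechEventType is END_OF_SINGLE_UTTERANCE.
    if (data.speechEventType === 'END_OF_SINGLE_UTTERANCE') {
      console.log('end of single utterance detected');
      stopRecognitionStream(); // stop feeding audio once the user stops speaking
    }
  });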
Thank you in advance!

Related

Stream audio to Dialogflow with chunks from browser

We're experimenting with Dialogflow and have run into a complete stop for the time being. We're trying to set up a browser client that streams audio in chunks to Dialogflow via the node v2beta1 version of the dialogflow npm package. We followed the example to get it running, and it works fine when we use the node server to pick up the sound via extra software (sox), but we want to stream from the browser. So we've set up a small code snippet that picks up the MediaStream from the mic.
When the data event is triggered we get a chunk (an ArrayBuffer) which we pass on, in chunks, to our node server.
On the server we've followed this example: https://cloud.google.com/dialogflow-enterprise/docs/detect-intent-stream#detect-intent-text-nodejs. The only thing we do differently is that instead of using pump to chain streams, we just write our chunks to the sessionsClient:
streamingDetectIntent().write({ inputAudio: [chunk] })
During experimentation we received several errors that we solved, but at this point we pass our chunks and receive empty responses, both during streaming and at the end.
Is this a valid way of passing audio to Dialogflow, or do we really need to set up a stream? We do not want to use the node server as the entry point; it needs to be the browser. We will have full control.
Client
import getUserMedia from 'get-user-media-promise';
import MicrophoneStream from 'microphone-stream';

export const startVoiceStream = () => {
  const microphoneStream = new MicrophoneStream();

  getUserMedia({ video: false, audio: true })
    .then(function(micStream) {
      microphoneStream.setStream(micStream);
      socket.emit('startMicStream');
      state.streamingMic = true;

      setTimeout(() => {
        // Just closing the stream on a timer for now
        socket.emit('endMicStream');
      }, 5000);
    })
    .catch(function(error) {
      console.log(error);
    });

  microphoneStream.on('data', function(chunk) {
    if (state.streamingMic) {
      socket.emit('micStreamData', chunk);
    }
  });
};
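For completeness, here is the conversion I assume is still needed on our side before the chunks can be treated as LINEAR16 audio: microphone-stream emits 32-bit float PCM by default, and the browser records at 44.1/48 kHz rather than the 16000 Hz we declare on the server, so this sketch (names are mine, untested) only covers the bit-depth part:
// Sketch (assumption): convert a raw microphone-stream chunk (32-bit float
// PCM in a Buffer) into 16-bit signed PCM before emitting it to the server.
const floatTo16BitPCM = (chunk) => {
  const float32 = new Float32Array(chunk.buffer, chunk.byteOffset, chunk.byteLength / 4);
  const int16 = new Int16Array(float32.length);
  for (let i = 0; i < float32.length; i++) {
    const s = Math.max(-1, Math.min(1, float32[i]));
    int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return int16.buffer; // ArrayBuffer, which socket.io sends as binary
};

// usage inside the 'data' handler:
// socket.emit('micStreamData', floatTo16BitPCM(chunk));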
The server code is much longer, so I'll spare the details, but these are the main parts.
const initialStreamRequest = {
  session: sessions.sessionPath,
  queryParams: {
    session: sessions.sessionPath, //TODO: try to delete
  },
  queryInput: {
    audioConfig: {
      audioEncoding: 'AUDIO_ENCODING_LINEAR_16',
      sampleRateHertz: '16000',
      languageCode: 'en-US',
    },
    singleUtterance: false
  },
};
const startRecognitionStream = socketClient => {
  streamIntent = sessions.sessionClient
    .streamingDetectIntent()
    .on('error', error => {
      console.error({ error });
      socketClient.emit('streamError', error);
    })
    .on('data', data => {
      socketClient.emit('debug', { message: 'STREAM "ON DATA"', data });
      if (data.recognitionResult) {
        socketClient.emit(
          'playerTranscript',
          data.recognitionResult.transcript,
        );
        console.log(
          `#Intermediate transcript : ${data.recognitionResult.transcript}`,
        );
      } else {
        socketClient.emit('streamAudioResponse', data);
      }
    });

  streamIntent.write(initialStreamRequest);
};

socket.on('micStreamData', data => {
  if (streamIntent !== null) {
    stop = true;
    streamIntent.write({ inputAudio: data });
  }
});
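For comparison, this is roughly how I read the pump-based chaining in the sample we followed: the config request is written first, then every audio chunk is wrapped in its own { inputAudio } request on the way into the detect-intent stream. A sketch of the docs example, not our code; audioSource stands in for wherever the raw chunks come from:
const pump = require('pump');
const through2 = require('through2');

// Sketch of the linked sample: send the config first, then pipe raw audio
// chunks through a transform that wraps each one as { inputAudio: chunk }.
const detectStream = sessions.sessionClient
  .streamingDetectIntent()
  .on('error', console.error)
  .on('data', data => { /* recognitionResult / queryResult handling */ });

detectStream.write(initialStreamRequest);

pump(
  audioSource, // a readable stream of raw LINEAR16 chunks (assumption)
  through2.obj((chunk, _, next) => next(null, { inputAudio: chunk })),
  detectStream
);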

NodeJs Google Speech API Followed Streaming recognition

I am working on a project that needs to use streaming recognition. I want to make it work with Node.js, like here. It works fine, but as the documentation says, it stops after roughly 60 seconds.
What I want is to restart the recording and the recognition right after this error happens, as my speech might take more than one minute.
I tried to stop the recording when I receive an error from the recognition:
const record = require('node-record-lpcm16');
const unirest = require('unirest');
const speech = require('@google-cloud/speech');

const client = new speech.SpeechClient();

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode
  },
  interimResults: false
};

const recognizeStream = client
  .streamingRecognize(request)
  .on('error', error => {
    console.log("Timeout...");
    console.error(error);
    console.log("Closing record");
    return record.stop();
  })
  .on('data', (data) => {
    //Process data
  });
And restarting the recording when it is ended:
record
  .start({
    sampleRateHertz: sampleRateHertz,
    threshold: 0,
    verbose: false,
    recordProgram: 'rec',
    silence: '1.0'
  })
  .on('error', console.error)
  .on('end', data => {
    record.start({
      sampleRateHertz: sampleRateHertz,
      threshold: 0,
      verbose: false,
      recordProgram: 'rec',
      silence: '1.0'
    }).pipe(recognizeStream);
  })
  .pipe(recognizeStream);
As a Node.js newbie, this is the only solution I thought of, but it doesn't work. The recording starts again correctly, but the recognition doesn't, as it doesn't transcribe anything after it has been closed.
Any idea how to perform recognition for more than one minute with streaming recognition? I'd like something similar to what can be done in Python here.
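To make it clearer what I'm aiming for, here is a minimal sketch of the restart pattern I have in mind: keep one recording stream alive and recreate only the recognize stream whenever it errors out. startStream and recording are names I made up; the recognize stream cannot be reused once it has errored:
// Sketch: keep a single recording, recreate the recognize stream on error.
function startStream() {
  const recognizeStream = client
    .streamingRecognize(request)
    .on('error', error => {
      console.error(error);
      recording.unpipe(recognizeStream); // stop feeding the dead stream
      startStream();                     // open a fresh stream and re-pipe
    })
    .on('data', (data) => {
      //Process data
    });

  recording.pipe(recognizeStream);
}

const recording = record.start({
  sampleRateHertz: sampleRateHertz,
  threshold: 0,
  verbose: false,
  recordProgram: 'rec',
  silence: '1.0'
});

startStream();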

How to properly run the streamingRecognize() function provided by google cloud speech api

I have an issue with the streamingRecognize() function when streaming speech to text.
When I run it, I get the error:
Uncaught TypeError:
speechClient.streamingRecognize is not a function
When I try accessing it through the api object of my speechClient instance, I get this error as a response:
google.cloud.speech.v1.StreamingRecognizeRequest#0 is not a field:
undefined
This is my code:
console.log('Listening started')
document.getElementById("speak-btn").value = "Stop";

// retrieve settings
console.log("Retrieve audio and language settings...")
database.existSettingsRecord({}, (settingsRecord) => {
  // The BCP-47 language code to use, e.g. 'en-US'
  const languageCode = settingsRecord.language; //'en-US';

  // Your Google Cloud Platform project ID
  const nathProjectId = 'protocol-recorder-201707';

  // Instantiates a client
  const speechClient = Speech({
    projectId: nathProjectId,
    keyFilename: './rsc/credentials/spr-426ec2968cf6.json'
  });

  // The encoding of the audio file, e.g. 'LINEAR16'
  const encoding = 'LINEAR16';
  // The sample rate of the audio file in hertz, e.g. 16000
  const sampleRateHertz = 16000;

  const request = {
    config: {
      encoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode
    },
    interimResults: false // If you want interim results, set this to true
  };

  // Create a recognize stream
  var notes = '';
  console.log('create the recognize Stream object to be piped..')
  //const recognizeStream = speechClient.createRecognizeStream(request)
  console.log("speechClient : ", speechClient)
  console.log("grpc : ", grpc)
  const recognizeStream = speechClient.streamingRecognize(request)
    .on('error', console.error)
    .on('data', (response) => {
      //process.stdout.write(response.results)
      process.stdout.write(
        (response.results[0] && response.results[0].alternatives[0])
          ? `Transcription: ${response.results[0].alternatives[0].transcript}\n`
          : `\n\nReached transcription time limit, press Ctrl+C\n`);
      notes = document.getElementById("notes").value;
      notes = notes + response.results;
      document.getElementById("notes").value = notes;
    });

  // Start recording and send the microphone input to the Speech API
  console.log('Start recording and send the microphone input to the Speech API..')
  record.start({
    sampleRateHertz: sampleRateHertz,
    threshold: 0,
    // Other options, see https://www.npmjs.com/package/node-record-lpcm16#options
    verbose: true,
    recordProgram: 'sox', // Try also "arecord" or "sox"
    silence: '1.0',
    device: settingsRecord.audio_input
  })
    .on('error', console.error)
    .pipe(recognizeStream);
});
I am using:
Win 10
Node.js 7.10.0
sox 14.4.2
Thanks for any help on this issue!
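For reference, the other snippets I've seen that do have streamingRecognize instantiate the client differently. This is my assumption about the newer package layout, based on the official streaming sample, not something I've verified against my installed version:
// Assumption: newer @google-cloud/speech versions expose the client as
// speech.SpeechClient, and that client has streamingRecognize().
const speech = require('@google-cloud/speech');

const speechClient = new speech.SpeechClient({
  projectId: nathProjectId,
  keyFilename: './rsc/credentials/spr-426ec2968cf6.json'
});

const recognizeStream = speechClient.streamingRecognize(request)
  .on('error', console.error)
  .on('data', (response) => { /* same handler as above */ });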

Why is the API response time for Audio Stream Recognition so slow?

I am using very similar code to Google's example for performing speech recognition on an Audio stream using the Node.js client library.
The API is parsing my audio correctly, but I find myself waiting 30-45 seconds before I get a response. Considering how snappy the demo is, this doesn't seem right. Is there something I am configuring incorrectly on my end?
I've tried writing to a local file instead just to make sure the audio is coming through clearly, and the recording seems fine.
Thanks for any help you can give!
import record from 'node-record-lpcm16';
import Speech from '@google-cloud/speech';

function streamToParser() {
  const speech = Speech();

  const request = {
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 16000,
      languageCode: 'en-US',
    },
    interimResults: true,
  };

  const recognizeStream = speech.createRecognizeStream(request)
    .on('error', console.error)
    .on('data', (data) => {
      console.log(data.results)
    });

  record
    .start({
      sampleRate: 16000,
      threshold: .6,
      verbose: true,
      silence: '5.0'
    })
    .on('error', console.error)
    .pipe(recognizeStream)

  console.log('Google is listening...')
};

streamToParser();
Figured it out - I wasn't configuring Speech with auth credentials, so my requests must have been deprioritized. Here's the config that fixed it, as per the instructions here:
const speech = Speech({
  projectId: 'my project ID from the Google Cloud dev console',
  keyFilename: 'path/to/keyfile.json', // that I generated/downloaded from the Google Cloud dev console
});
To create a json keyfile, follow the steps outlined here in the "On Your Own Server" section.
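Another option that I believe works is to point Application Default Credentials at the keyfile through an environment variable instead, so the client can be constructed without options (untested in this project):
// Assumes the environment variable is set before starting node, e.g.
//   export GOOGLE_APPLICATION_CREDENTIALS=/path/to/keyfile.json
// The client library then picks up the credentials automatically.
const speech = Speech();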

Google speech API bad voice recognition when using SOX

I am trying to create a very simple voice recognition program with Node.js. I have hooked up the Google Speech API and can send over a correctly recorded .wav file and get back a transcription, and the recognition is very good (recorded with Audacity).
BUT I have issues getting voice recognition "on the fly", i.e. sending the audio stream directly from the mic to the Google Speech API.
Here is my main method that records voice and sends it over to Google.
function recognize(encoding, sampleRateHertz, languageCode) {
  const request = {
    config: {
      encoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode
    },
    interimResults: true // If you want interim results, set this to true
  };

  // Imports the Google Cloud client library
  const Speech = require('@google-cloud/speech');

  // Instantiates a client
  const speech = Speech();

  // Create a recognize stream
  const recognizeStream = speech.createRecognizeStream(request)
    .on('error', console.error)
    .on('data', (data) => process.stdout.write(data.results + ', '))

  let fs = require('fs');
  let Mic = require('node-microphone');

  let mic = new Mic({ 'rate': '16000', 'channels': '1', 'debug': true, 'exitOnSilence': 6, 'bitwidth': '16' });
  let micStream = mic.startRecording();

  micStream.pipe(recognizeStream);
  micStream.pipe(fs.createWriteStream('test.wav'))

  setTimeout(() => {
    //logger.info('stopped recording');
    console.log('stopped writing')
    mic.stopRecording();
  }, 10000);

  mic.on('info', (info) => {
    console.log('INFO ' + info);
  });

  mic.on('error', (error) => {
    console.log(error);
  });
}
And the config data I pass to the method:
options({
  encoding: {
    alias: 'e',
    default: 'LINEAR16',
    global: true,
    requiresArg: true,
    type: 'string'
  },
  sampleRateHertz: {
    alias: 'r',
    default: 16000,
    global: true,
    requiresArg: true,
    type: 'number'
  },
  languageCode: {
    alias: 'l',
    default: 'en-US',
    global: true,
    requiresArg: true,
    type: 'string'
  }
})
So I use 'node-microphone' for recording, I'm on Windows, and SOX is installed. I send the audio over to Google. I don't get errors, but recognition is VERY bad. I get a transcription only for very easy words or phrases like "who", "food", "call". Mostly, if I speak normally, nothing is returned.
I have a feeling that something with the encoding or recording rate is wrong (like the recording is "too fast" and Google does not understand), but I don't see my error.
I also added file saving. When I open the file and listen to it, it sounds normal. When I send THIS file for recognition I get almost nothing back. So, there is something wrong with the way the audio stream is recorded.
EDIT:
I am almost sure the issue is in SOX. All files recorded with some other program work much better.
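As a cross-check I'm considering recording with node-record-lpcm16 instead of node-microphone, as in the other snippets above, so sox is handed an explicit rate that matches the LINEAR16 / 16000 Hz request config (a sketch, untested on my machine):
// Sketch: same request/recognizeStream as above, but recorded with
// node-record-lpcm16 so the rate handed to sox matches the config exactly.
const record = require('node-record-lpcm16');

record
  .start({
    sampleRateHertz: 16000,
    threshold: 0,
    verbose: false,
    recordProgram: 'sox',
    silence: '1.0'
  })
  .on('error', console.error)
  .pipe(recognizeStream);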