I am working on a project that will need to use streaming recognition. I want to make it work with Node.js, like here. It works fine, but as the documentation says, it stops after roughly 60 seconds.
What I want is to restart the recording and the recognition right after this error happens, as my speech might take more than one minute.
I tried to stop the recording when I receive an error in my recognition:
const record = require('node-record-lpcm16');
const unirest = require('unirest');
const speech = require('@google-cloud/speech');

const client = new speech.SpeechClient();

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode
  },
  interimResults: false
};

const recognizeStream = client
  .streamingRecognize(request)
  .on('error', error => {
    console.log("Timeout...");
    console.error(error);
    console.log("Closing record");
    return record.stop();
  })
  .on('data', (data) => {
    // Process data
  });
And restarting the recording when it ends:
record
  .start({
    sampleRateHertz: sampleRateHertz,
    threshold: 0,
    verbose: false,
    recordProgram: 'rec',
    silence: '1.0'
  })
  .on('error', console.error)
  .on('end', data => {
    record.start({
      sampleRateHertz: sampleRateHertz,
      threshold: 0,
      verbose: false,
      recordProgram: 'rec',
      silence: '1.0'
    }).pipe(recognizeStream);
  })
  .pipe(recognizeStream);
As a newbie in Node.js, this is the only solution I could think of, but it doesn't work: the recording starts again correctly, but the recognition doesn't transcribe anything after the stream has been closed.
Any idea how to perform recognition for more than one minute with streaming recognition? I'd like something similar to what can be done in Python here.
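For reference, a minimal sketch of that restart approach (using only the client, request, and recording options already defined above): a streaming recognize stream cannot be reused once it has ended, so both the stream and the recording have to be recreated together, for example by wrapping them in a function that the error handler calls again.
function startStream() {
  // A fresh stream is created on every (re)start; the old one cannot be reused.
  const recognizeStream = client
    .streamingRecognize(request)
    .on('error', error => {
      console.error(error);
      console.log('Restarting recording and recognition...');
      record.stop();   // stop the current recording
      startStream();   // open a new stream and a new recording
    })
    .on('data', (data) => {
      // Process data
    });

  // Start a new recording and pipe it into the new stream
  record
    .start({
      sampleRateHertz: sampleRateHertz,
      threshold: 0,
      verbose: false,
      recordProgram: 'rec',
      silence: '1.0'
    })
    .on('error', console.error)
    .pipe(recognizeStream);
}

startStream();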
Related
I'm trying to live-stream speech to text using Google. I have installed Node on my server.
I have successfully implemented it, but I want Google to recognize when the user stops speaking. Google explains how to do that using single_utterance=true, but it is not taking effect. Can you please tell me what the issue is in the code below? Thank you!
var request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    //profanityFilter: false,
    enableWordTimeOffsets: true,
    //single_utterance: true
    // speechContexts: [{
    //   phrases: ["hoful","shwazil"]
    // }] // add your own speech context for better recognition
  },
  interimResults: true, // If you want interim results, set this to true
  singleUtterance: true
};
function startRecognitionStream(client, data) {
  console.log(request);
  recognizeStream = speechClient.streamingRecognize(request)
    .on('error', console.error)
    .on('data', (data) => {
      process.stdout.write(
        (data.results[0] && data.results[0].alternatives[0])
          ? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
          : `\n\nReached transcription time limit, press Ctrl+C\n`);
      client.emit('speechData', data);

      // if end of utterance, let's restart stream
      // this is a small hack: after 65 seconds of silence, the stream will still throw an error for speech length limit
      if (data.results[0] && data.results[0].isFinal) {
        stopRecognitionStream();
        startRecognitionStream(client);
        // console.log('restarted stream serverside');
      }
    })
    .on('end_of_single_utterance', (data) => {
      process.stdout.write('data ended');
      console.log('data ended');
    });
}
Thank you in advance!
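One thing worth checking (a sketch of an assumption on my part, not a confirmed fix): as far as I can tell, the Node.js client does not emit a dedicated end_of_single_utterance event. When singleUtterance is true, the end-of-utterance notification arrives on the normal data event with speechEventType set to END_OF_SINGLE_UTTERANCE, so the handler would look roughly like this:
recognizeStream = speechClient.streamingRecognize(request)
  .on('error', console.error)
  .on('data', (data) => {
    // The end-of-utterance signal comes through the data event, not a separate event
    if (data.speechEventType === 'END_OF_SINGLE_UTTERANCE') {
      console.log('end of single utterance');
      stopRecognitionStream();
      startRecognitionStream(client);
      return;
    }
    if (data.results[0] && data.results[0].alternatives[0]) {
      process.stdout.write(`Transcription: ${data.results[0].alternatives[0].transcript}\n`);
      client.emit('speechData', data);
    }
  });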
https://cloud.google.com/speech-to-text/docs/streaming-recognize
I've been trying to execute the sample Google Speech API code under "Performing Streaming Speech Recognition on an Audio Stream".
Here is the code I have been trying to execute:
'use strict';

const record = require('node-record-lpcm16');
const speech = require('@google-cloud/speech');
const exec = require('child_process').exec;

//const speech = Speech();
const client = new speech.SpeechClient();

const encoding = 'LINEAR16';
const sampleRateHertz = 16000;
const languageCode = 'en-US';

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode
  },
  interimResults: true // If you want interim results, set this to true
};

const recognizeStream = client.streamingRecognize(request)
  .on('error', console.error)
  .on('data', (data) =>
    process.stdout.write(
      (data.results[0] && data.results[0].alternatives[0])
        ? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
        : `\n\nReached transcription time limit, press Ctrl+C\n`)
  );

record.start({
  sampleRateHertz: sampleRateHertz,
  threshold: 0.5,
  verbose: true,
  recordProgram: 'arecord', // Try also "rec" or "sox"
  silence: '10.0'
}).on('error', console.error)
  .pipe(recognizeStream);

console.log('Listening, press Ctrl+C to stop.');
The output in the terminal was posted as a screenshot and is not reproduced here.
I realise there's a problem with the encoding of the output stream from arecord, i.e. it isn't in line with the configuration specified in the program, but I'm not sure what to do to correct this.
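If that suspicion is right, a hedged way to rule it out is to spawn arecord directly with the format pinned to exactly what the request config declares (16-bit signed little-endian PCM, 16000 Hz, mono, no WAV header) and pipe its stdout into the recognize stream created above. The flags below are standard arecord options:
const { spawn } = require('child_process');

// Record raw PCM that matches the request config exactly
const arecord = spawn('arecord', [
  '-f', 'S16_LE',  // 16-bit signed little-endian == LINEAR16
  '-r', '16000',   // must match sampleRateHertz
  '-c', '1',       // mono
  '-t', 'raw'      // raw samples, no WAV header
]);

arecord.stdout.pipe(recognizeStream);
arecord.stderr.pipe(process.stderr);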
I have an issue with the streamingRecognize() function when streaming speech to text.
When I run, I get the error:
Uncaught TypeError:
speechClient.streamingRecognize is not a function
When I try accessing it through the api object of my speechClient instance, I get this error in response:
google.cloud.speech.v1.StreamingRecognizeRequest#0 is not a field:
undefined
This is my code:
console.log('Listening started');
document.getElementById("speak-btn").value = "Stop";

// retrieve settings
console.log("Retrieve audio and language settings...");
database.existSettingsRecord({}, (settingsRecord) => {
  // The BCP-47 language code to use, e.g. 'en-US'
  const languageCode = settingsRecord.language; //'en-US';

  // Your Google Cloud Platform project ID
  const nathProjectId = 'protocol-recorder-201707';

  // Instantiates a client
  const speechClient = Speech({
    projectId: nathProjectId,
    keyFilename: './rsc/credentials/spr-426ec2968cf6.json'
  });

  // The encoding of the audio file, e.g. 'LINEAR16'
  const encoding = 'LINEAR16';
  // The sample rate of the audio file in hertz, e.g. 16000
  const sampleRateHertz = 16000;

  const request = {
    config: {
      encoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode
    },
    interimResults: false // If you want interim results, set this to true
  };

  // Create a recognize stream
  var notes = '';
  console.log('create the recognize stream object to be piped...');
  //const recognizeStream = speechClient.createRecognizeStream(request)
  console.log("speechClient : ", speechClient);
  console.log("grpc : ", grpc);
  const recognizeStream = speechClient.streamingRecognize(request)
    .on('error', console.error)
    .on('data', (response) => {
      //process.stdout.write(response.results)
      process.stdout.write(
        (response.results[0] && response.results[0].alternatives[0])
          ? `Transcription: ${response.results[0].alternatives[0].transcript}\n`
          : `\n\nReached transcription time limit, press Ctrl+C\n`);
      notes = document.getElementById("notes").value;
      notes = notes + response.results;
      document.getElementById("notes").value = notes;
    });

  // Start recording and send the microphone input to the Speech API
  console.log('Start recording and send the microphone input to the Speech API...');
  record.start({
    sampleRateHertz: sampleRateHertz,
    threshold: 0,
    // Other options, see https://www.npmjs.com/package/node-record-lpcm16#options
    verbose: true,
    recordProgram: 'sox', // Try also "arecord" or "sox"
    silence: '1.0',
    device: settingsRecord.audio_input
  })
    .on('error', console.error)
    .pipe(recognizeStream);
});
I am using:
Win 10
Node.js 7.10.0
SoX 14.4.2
Thanks for any help on this issue!
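A hedged guess, since it depends on which release of @google-cloud/speech is installed: the older style of client shown above, instantiated with Speech({...}), exposed createRecognizeStream() and did not have streamingRecognize(), while newer releases expose streamingRecognize() on a client created with new speech.SpeechClient({...}). A sketch of the newer style, reusing the project ID and key file path from the code above:
const speech = require('@google-cloud/speech');

// Newer client style; projectId and keyFilename are taken from the snippet above
const speechClient = new speech.SpeechClient({
  projectId: nathProjectId,
  keyFilename: './rsc/credentials/spr-426ec2968cf6.json'
});

const recognizeStream = speechClient.streamingRecognize(request)
  .on('error', console.error)
  .on('data', (response) => {
    // ...same data handling as before
  });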
I am using very similar code to Google's example for performing speech recognition on an Audio stream using the Node.js client library.
The API is parsing my audio correctly, but I find myself waiting 30-45 seconds before I get a response. Considering how snappy the demo is, this doesn't seem right. Is there something I am configuring incorrectly on my end?
I've tried writing to a local file instead just to make sure the audio is coming through clearly, and the recording seems fine.
Thanks for any help you can give!
import record from 'node-record-lpcm16';
import Speech from '@google-cloud/speech';

function streamToParser() {
  const speech = Speech();

  const request = {
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 16000,
      languageCode: 'en-US',
    },
    interimResults: true,
  };

  const recognizeStream = speech.createRecognizeStream(request)
    .on('error', console.error)
    .on('data', (data) => {
      console.log(data.results);
    });

  record
    .start({
      sampleRate: 16000,
      threshold: 0.6,
      verbose: true,
      silence: '5.0'
    })
    .on('error', console.error)
    .pipe(recognizeStream);

  console.log('Google is listening...');
}

streamToParser();
Figured it out - I wasn't configuring Speech with auth credentials, so my requests must have been deprioritized. Here's the config that fixed it, as per the instructions here:
const speech = Speech({
  projectId: 'my project ID from the Google Cloud dev console',
  keyFilename: 'path/to/keyfile.json', // that I generated/downloaded from the Google Cloud dev console
});
To create a json keyfile, follow the steps outlined here in the "On Your Own Server" section.
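As a side note (an alternative I believe works, though it is not part of the original fix): the credentials can also be supplied through the GOOGLE_APPLICATION_CREDENTIALS environment variable instead of keyFilename, in which case the client picks them up automatically:
// export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"
const speech = Speech(); // credentials are resolved from the environment variable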
I am trying to create a very simple voice recognition tool with Node.js. I have hooked up the Google Speech API and can send over a correctly recorded .wav file and get back a transcription, and the recognition is very good (recorded with Audacity).
BUT I have issues getting voice recognition "on the fly", i.e. sending the audio stream directly from the mic to the Google Speech API.
Here is my main method that records voice and sends it over to Google.
function recognize(encoding, sampleRateHertz, languageCode) {
  const request = {
    config: {
      encoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode
    },
    interimResults: true // If you want interim results, set this to true
  };

  // Imports the Google Cloud client library
  const Speech = require('@google-cloud/speech');

  // Instantiates a client
  const speech = Speech();

  // Create a recognize stream
  const recognizeStream = speech.createRecognizeStream(request)
    .on('error', console.error)
    .on('data', (data) => process.stdout.write(data.results + ', '));

  let fs = require('fs');
  let Mic = require('node-microphone');

  let mic = new Mic({ 'rate': '16000', 'channels': '1', 'debug': true, 'exitOnSilence': 6, 'bitwidth': '16' });
  let micStream = mic.startRecording();

  micStream.pipe(recognizeStream);
  micStream.pipe(fs.createWriteStream('test.wav'));

  setTimeout(() => {
    //logger.info('stopped recording');
    console.log('stopped writing');
    mic.stopRecording();
  }, 10000);

  mic.on('info', (info) => {
    console.log('INFO ' + info);
  });

  mic.on('error', (error) => {
    console.log(error);
  });
}
And the config data I pass to method
options({
encoding: {
alias: 'e',
default: 'LINEAR16',
global: true,
requiresArg: true,
type: 'string'
},
sampleRateHertz: {
alias: 'r',
default: 16000,
global: true,
requiresArg: true,
type: 'number'
},
languageCode: {
alias: 'l',
default: 'en-US',
global: true,
requiresArg: true,
type: 'string'
}
})
So I use 'node-microphone' for recording; I am on Windows and SoX is installed. I send the stream over to Google. I don't get errors, but the recognition is VERY bad. I get a transcription only for very easy words or phrases like "who", "food", "call". Mostly, if I speak normally, nothing is returned.
I have a feeling that something is wrong with the encoding or the recording rate (like the recording is "too fast" and Google does not understand it), but I don't see my error.
I also added file saving. When I open the file and listen to it, it sounds normal. But when I send THIS file for recognition, I get almost nothing back. So there is something wrong with the way the audio stream is recorded.
EDIT:
I am almost sure the issue is in SoX. Files recorded with some other programs work much better.
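If SoX is indeed the culprit, a hedged debugging sketch is to bypass node-microphone and run sox directly with every parameter pinned to the request config (16-bit signed PCM, 16000 Hz, mono, headerless), so the bytes reaching the API are exactly what the config claims. recognizeStream here is the stream created in recognize() above:
const { spawn } = require('child_process');

// -d reads from the default input device (the microphone);
// the output options force the exact format the request config declares.
const sox = spawn('sox', [
  '-d',                    // input: default audio device
  '-t', 'raw',             // output: raw samples, no header
  '-b', '16',              // 16-bit samples
  '-e', 'signed-integer',  // == LINEAR16
  '-r', '16000',           // must match sampleRateHertz
  '-c', '1',               // mono
  '-'                      // write to stdout
]);

sox.stdout.pipe(recognizeStream);
sox.stderr.pipe(process.stderr);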