I am using Google Cloud Speech API in Node.js. When I send a local audio file it gives the result but when I try to send a live stream it gets stopped within a second. Can anyone help me with this?
I am using this file: recognize.js
this is the code when I pass the audio file to google speech API. here it is working fine.I am passing the audio file stored in fileName sending into google speech API
function sample() {
const projectId = 'project id';
let file = "conf.json" //google exported this for you
var speech = require('#google-cloud/speech')({
projectId: 'project id',
keyFilename: 'Speech to text-a5ff6058e586.json'
});
const fs = require('fs');
const fileName = 'C:/Users/nap1225/Downloads/audio-files/audio_001.wav';
// const fileName = 'C:/xampp/htdocs/SpeechWatson/public/audio/Us_English_Broadband_Sample_2.wav';
// const fileName = 'Sample 1.wav';
// const fileName = 'C:/Users/nap1225/Desktop/dolo.wav';
// Reads a local audio file and converts it to base64
const fileMp3 = fs.readFileSync(fileName);
const audioBytes = fileMp3.toString('base64');
const audio = {
content: audioBytes
};
const config = {
encoding: 'LINEAR16',
sampleRateHertz: 16000,
languageCode: 'en-US',
speechContexts: {
"phrases": ["refsum"]
}
};
const request = {
audio: audio,
config: config
};
speech.recognize(request)
.then((results) => {
const transcription = results[0].results[0].alternatives[0].transcript;
console.log(`Transcription: `, transcription);
})
.catch((err) => {
console.error('ERROR:', err);
});
}
sample();
2.here is the code I am sending stream audio.when I execute this code in, it is recording for 1 second and get stopped
function streamingMicRecognize () {
// [START speech_streaming_mic_recognize]
// Imports the Google Cloud client library
const record = require('node-record-lpcm16');
const projectId = 'project id';
let file="conf.json"//google exported this for you
const speech = require('#google-cloud/speech')({
projectId: 'project id',
keyFilename: 'Speech to text-a5ff6058e586.json'
});
// Instantiates a client
//const speech = Speech();
// The encoding of the audio file, e.g. 'LINEAR16'
const encoding = 'LINEAR16';
// The sample rate of the audio file in hertz, e.g. 16000
const sampleRateHertz = 16000;
// The BCP-47 language code to use, e.g. 'en-US'
const languageCode = 'en-US';
const request = {
config: {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode
},
interimResults: true, // If you want interim results, set this to true
singleUtterance: false
};
// Create a recognize stream
const recognizeStream = speech.streamingRecognize(request)
.on('error', console.error)
.on('data', (data) =>
process.stdout.write(
(data.results[0] && data.results[0].alternatives[0])
? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
: `\n\nReached transcription time limit, press Ctrl+C\n`));
// Start recording and send the microphone input to the Speech API
record
.start({
sampleRateHertz: sampleRateHertz,
threshold: 20,
// Other options, see https://www.npmjs.com/package/node-record-lpcm16#options
verbose: false,
recordProgram: 'sox', // Try also "arecord" or "sox"
silence: '10.0',
device : 'plughw:0'
// device : settingsRecord.audio_input
})
.on('error', console.error)
.pipe(recognizeStream);
console.log('Listening, press Ctrl+C to stop.');
// [END speech_streaming_mic_recognize]
/* .command(
`listen`,
`Detects speech in a microphone input stream. This command requires that you have SoX installed and available in your $PATH. See https://www.npmjs.com/package/node-record-lpcm16#dependencies`,
{},
(opts) => streamingMicRecognize(opts.encoding, opts.sampleRateHertz, opts.languageCode)
)
.options({
encoding: {
alias: 'e',
default: 'LINEAR16',
global: true,
requiresArg: true,
type: 'string'
},
sampleRateHertz: {
alias: 'r',
default: 16000,
global: true,
requiresArg: true,
type: 'number'
},
languageCode: {
alias: 'l',
default: 'en-US',
global: true,
requiresArg: true,
type: 'string'
}
}) */
}
streamingMicRecognize();
Related
I have created app for speech to text converter. react frontend and nodejs API.i record audio from react and post it to nodejs.but google API result is empty.how can I fix it?
why getting always empty results?
that's my code.
ReactMic Recorder
<ReactMic
record={record}
className="sound-wave"
onStop={onStop}
onData={onData}
strokeColor="#000000"
backgroundColor="#FF4081"
mimeType="audio/wav"/>
<button onClick={startRecording} type="button">Start</button>
<button onClick={stopRecording} type="button">Stop</button>
NodeJs API
app.post('/SpeechConvert', (req, res) => {
const client = new speech.SpeechClient();
console.log(req.files.file);
req.files.file.mv('./input.wav',function (err) {
if (err) {
console.log(err);
}
})
async function speechToText() {
// The name of the audio file to transcribe
const fileData = req.files.file.data;
// Reads a local audio file and converts it to base64
const file = fs.readFileSync('input.wav');
const audioBytes = fileData.toString('base64');
// console.log(audioBytes);
// The audio file's encoding, sample rate in hertz, and BCP-47 language code
const audio = {
content: audioBytes,
};
const config = {
enableAutomaticPunctuation: true,
encoding: 'LINEAR16',
sampleRateHertz: 44100,
languageCode: 'en-US',
};
const request = {
audio: audio,
config: config,
};
// Detects speech in the audio file
const [response] = await client.recognize(request);
console.log(response);
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
console.log(`Transcription: ${transcription}`);
res.send({ 'transcription': transcription, 'msg': 'The Audio successfully converted to the text' });
}
speechToText().catch(console.error);
});
can anyone help me to fix this?
I tried to use google cloud speech to text in my node.js project. I followed he quick startup guides from docs. I pass a audio file in wav and flac format but in both cases I receive an error: error TypeError: .google.cloud.speech.v1.RecognizeRequest.audio: object expected
try {
const speech = require('#google-cloud/speech');
const client = new speech.SpeechClient();
const filename = require('path').join(__dirname + '/../temp/test.wav');
const config = {
encoding: 'LINEAR16',
sampleRateHertz: 16000,
languageCode: 'en-US'
};
const audio = fs.readFileSync(filename).toString('base64');
const request = {
config: config,
audio: audio
};
const [response] = await client.recognize(request);
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
console.log(`Transcription: ${transcription}`);
as you can see request is an object and when I consoled.log it it is properly converted toString. But the function client.recognize doesn't work. Where do you think is the problem?
The audio property in the request needs to be an object with the audioBytes as a property called content, like in the example below:
// Reads a local audio file and converts it to base64
const audioBytes = fs.readFileSync(fileName).toString('base64');
// The audio file's encoding, sample rate in hertz, and BCP-47 language code
const audio = {
content: audioBytes,
};
const request = {
audio: audio,
config: config,
};
I am making an app where the user browser records the user speaking and sends it to the server which then passes it on to the Google speech to the text interface. I am using mediaRecorder to get 1-second blobs which are sent to a server. On the server-side, I send these blobs over to the Google speech to the text interface. However, I am getting an empty transcriptions.
I know what the issue is. Mediarecorder's default Mime Type id audio/WebM codec=opus, which is not accepted by google's speech to text API. After doing some research, I realize I need to use ffmpeg to convert blobs to LInear16. However, ffmpeg only accepts audio FILES and I want to be able to convert BLOBS. Then I can send the resulting converted blobs over to the API interface.
server.js
wsserver.on('connection', socket => {
console.log("Listening on port 3002")
audio = {
content: null
}
socket.on('message',function(message){
// const buffer = new Int16Array(message, 0, Math.floor(data.byteLength / 2));
// console.log(`received from a client: ${new Uint8Array(message)}`);
// console.log(message);
audio.content = message.toString('base64')
console.log(audio.content);
livetranscriber.createRequest(audio).then(request => {
livetranscriber.recognizeStream(request);
});
});
});
livetranscriber
module.exports = {
createRequest: function(audio){
const encoding = 'LINEAR16';
const sampleRateHertz = 16000;
const languageCode = 'en-US';
return new Promise((resolve, reject, err) =>{
if (err){
reject(err)
}
else{
const request = {
audio: audio,
config: {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode,
},
interimResults: false, // If you want interim results, set this to true
};
resolve(request);
}
});
},
recognizeStream: async function(request){
const [response] = await client.recognize(request)
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
console.log(`Transcription: ${transcription}`);
// console.log(message);
// message.pipe(recognizeStream);
},
}
client
recorder.ondataavailable = function(e) {
console.log('Data', e.data);
var ws = new WebSocket('ws://localhost:3002/websocket');
ws.onopen = function() {
console.log("opening connection");
// const stream = websocketStream(ws)
// const duplex = WebSocket.createWebSocketStream(ws, { encoding: 'utf8' });
var blob = new Blob(e, { 'type' : 'audio/wav; base64' });
ws.send(blob.data);
// e.data).pipe(stream);
// console.log(e.data);
console.log("Sent the message")
};
// chunks.push(e.data);
// socket.emit('data', e.data);
}
I wrote a similar script several years ago. However, I used a JS frontend and a Python backend instead of NodeJS. I remember using a sox transformer to transform the audio input into to an output that the Google Speech API could use.
Perhaps this might be useful for you.
https://github.com/bitnahian/speech-transcriptor/blob/9f186e5416566aa8a6959fc1363d2e398b902822/app.py#L27
TLDR:
Converted from a .wav format to .raw format using ffmpeg and sox.
I am trying to send microphone input at the client(nuxt) side to the node + socket.io server and then to the google speech api. I am getting stream from navigator.mediaDevices.getUserMedia({ audio: true }) and send it to back end using socket.io-stream. My client side code as follows.
import ss from 'socket.io-stream'
navigator.mediaDevices.getUserMedia({ audio: true }).then((mediaStream) => {
ss(this.$socket).emit('audio', mediaStream);
});
And my server code as follows.
const io = require('socket.io')(3555);
const ss = require('socket.io-stream');
const speech = require('#google-cloud/speech');
io.on('connection', (socket) => {
const client = new speech.SpeechClient({ keyFilename: 'key.json' });
const encoding = 'LINEAR16';
const sampleRateHertz = 16000;
const languageCode = 'en-US';
const request = {
config: {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode,
},
interimResults: true,
};
ss(socket).on('audio', (stream) => {
const recognizeStream = client.streamingRecognize(request)
.on('error', console.error)
.on('data', data => {
process.stdout.write(
data.results[0] && data.results[0].alternatives[0]
? `Transcription: ${data.results[0].alternatives[0].transcript}\n`
: `\n\nReached transcription time limit, press Ctrl+C\n`
);
});
stream.pipe(recognizeStream);
});
});
But this code doesn't work and display the error TypeError: stream.pipe is not a function.
Someone please point out the error or tell me a way to achieve this. Thank you!
I'm trying to replicate the code given at https://github.com/googleapis/nodejs-speech/blob/master/samples/recognize.js. There is no error when I run it locally. But here I'm confused on where can I see the result that is created. Is there a way that I can write the result to a file?
Here is the code.
const record = require('node-record-lpcm16');
// Imports the Google Cloud client library
const speech = require('#google-cloud/speech');
// Creates a client
const client = new speech.SpeechClient();
/**
* TODO(developer): Uncomment the following lines before running the sample.
*/
const encoding = 'LINEAR16';
const sampleRateHertz = 16000;
const languageCode = 'en-US';
const request = {
config: {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode,
},
interimResults: false, // If you want interim results, set this to true
};
// Create a recognize stream
const recognizeStream = client
.streamingRecognize(request)
.on('error', console.error)
.on('data', data =>
process.stdout.write(
data.results[0] && data.results[0].alternatives[0] ?
`Transcription: ${data.results[0].alternatives[0].transcript}\n` :
`\n\nReached transcription time limit, press Ctrl+C\n`
)
);
// Start recording and send the microphone input to the Speech API
record
.start({
sampleRateHertz: sampleRateHertz,
threshold: 0,
// Other options, see https://www.npmjs.com/package/node-record-lpcm16#options
verbose: false,
recordProgram: 'sox', // Try also "arecord" or "sox"
silence: '10.0',
})
.on('error', console.error)
.pipe(recognizeStream);
console.log('Listening, press Ctrl+C to stop.');
This is very confusing :(. please let me know how can I achieve this.
Thanks
It's in the "data". Please looking into the code and see how the console logs the data.
Example:
client
.recognize(request)
.then(data => {
const response = data[0];
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
console.log(`Transcription: `, transcription);
})