How to wait for a stream to finish piping? (Node.js)

I have an array of promises, so I used Promise.all on them and looped over the responses in the .then callback.
let promises = [];
promises.push(promise1);
promises.push(promise2);
promises.push(promise3);
Promise.all(promises).then((responses) => {
    for (let i = 0; i < responses.length; i++) {
        if (responses[i].property === something) {
            // do something
        } else {
            let file = fs.createWriteStream('./hello.pdf');
            let stream = responses[i].pipe(file);
            /*
            I WANT THE PIPING AND THE FOLLOWING CODE
            TO RUN BEFORE NEXT ITERATION OF FOR LOOP
            */
            stream.on('finish', () => {
                // extract the text out of the pdf
                extract(filePath, {splitPages: false}, (err, text) => {
                    if (err) {
                        console.log(err);
                    } else {
                        arrayOfDocuments[i].text_contents = text;
                    }
                });
            });
        }
    }
});
promise1, promise2, and promise3 are some HTTP requests; if one of them returns an application/pdf, I write it to a stream and parse the text out of it. But this code starts the next iteration before the text has been parsed out of the PDF. Is there a way to make the code wait until the piping and extracting are finished before moving on to the next iteration?

Without async/await, it's quite nasty. With async/await, just do this:
Promise.all(promises).then(async (responses) => {
    for (let i = 0; i < responses.length; i++) {
        if (responses[i].property === something) {
            // do something
        } else {
            let file = fs.createWriteStream('./hello.pdf');
            let stream = responses[i].pipe(file);
            await new Promise(fulfill => stream.on("finish", fulfill));
            // extract the text out of the PDF
        }
    }
});
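On Node 11.13+ you can also lean on events.once(), which wraps a one-shot event in a promise for you. A minimal sketch, reusing the stream names from the question:
const { once } = require('events');

let file = fs.createWriteStream('./hello.pdf');
let stream = responses[i].pipe(file);
// resolves on 'finish' and rejects if the stream emits 'error'
await once(stream, 'finish');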

Something like the following would also work. I use this pattern fairly often:
let promises = [];
promises.push(promise1);
promises.push(promise2);
promises.push(promise3);

function doNext(){
    if(!promises.length) return;
    promises.shift().then((resolved) => {
        if(resolved.property === something){
            ...
            doNext();
        } else {
            let file = fs.createWriteStream('./hello.pdf');
            let stream = resolved.pipe(file);
            stream.on('finish', () => {
                ...
                doNext();
            });
        }
    });
}
doNext();
Or break the handler up into a controller and a promisified handler:
function streamOrNot(obj){
    return new Promise((resolve, reject) => {
        if(obj.property === something){
            resolve();
            return;
        }
        let file = fs.createWriteStream...;
        let stream = obj.pipe(file);
        stream.on('finish', () => {
            ...
            resolve();
        });
    });
}

function doNext(){
    if(!promises.length) return;
    return promises.shift().then(streamOrNot).then(doNext);
}
doNext()

Use await with stream.pipeline() instead of stream.pipe():
import * as StreamPromises from "stream/promises";
...
await StreamPromises.pipeline(sourceStream, destinationStream);
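A fuller sketch in the context of the question, assuming each response is a readable stream as in the original loop:
import * as StreamPromises from "stream/promises";
import * as fs from "fs";

const file = fs.createWriteStream('./hello.pdf');
// resolves once the response has been fully written to disk,
// and rejects if either stream errors
await StreamPromises.pipeline(responses[i], file);
// safe to extract the text here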

You can wrap the else part in a self-invoking function, so that the handling of each stream happens in parallel:
(function(i) {
    let file = fs.createWriteStream('./hello.pdf');
    let stream = responses[i].pipe(file);
    stream.on('finish', () => {
        // extract the text out of the pdf
        extract(filePath, {splitPages: false}, (err, text) => {
            if (err) {
                console.log(err);
            } else {
                arrayOfDocuments[i].text_contents = text;
            }
        });
    });
})(i)
Alternatively, you can handle the streaming as part of the original/individual promise itself. Right now you create each promise and add it to the array; instead, add promise.then(handler) to the array (which is also a promise), and do your streaming work inside the then handler.
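A rough sketch of that second idea (handleResponse is an illustrative name, not from the original code):
let promises = [];
promises.push(promise1.then(handleResponse));
promises.push(promise2.then(handleResponse));
promises.push(promise3.then(handleResponse));

function handleResponse(response) {
    if (response.property === something) {
        // do something
        return;
    }
    // return a promise that settles when the stream has finished,
    // so the outer promise only resolves after the file is written
    return new Promise((resolve, reject) => {
        let file = fs.createWriteStream('./hello.pdf');
        let stream = response.pipe(file);
        stream.on('finish', resolve);
        stream.on('error', reject);
    });
}

// now this resolves only after every response has been fully handled
Promise.all(promises).then(() => { /* all files written */ });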

Related

Node JS - createWriteStream

I am going crazy trying to fix this bug so please help :-)
I am using https://pdfkit.org/
This creates a stream that, when finished, is piped to fs.createWriteStream.
My issue is that the first time the code runs, this works and the PDF is generated.
The next time the code runs, a file with zero bytes is created.
I am calling the function from an API running on Express.
The issue appears to be the async nature of fs.createWriteStream.
The stream finishes after the API has returned. I cannot seem to find a way to block while confirming the file has been created.
What is odd is that the code works the first time; run again, it fails.
Here is the pipe function:
async function _writeFile(fileObj) {
    let fileStream = fs.createWriteStream(fileObj.fileName)
    pipeline(
        doc,
        fileStream,
        async (err) => {
            if (err) {
                console.error('PDF failed', err);
                return ('Pipeline failed', err)
            } else {
                console.log('PDF succeeded');
            }
        }
    )
}
This is called from:
exports.drawReport = async (payload) => {
    var date = new Date();
    const timeStamp = date.toJSON();
    let path = './controllers/tmp/'
    var fileName = path + timeStamp + '.' + payload.type + '.pdf'
    try {
        // Start Report
        await _startReport(payload)
        // Check Starting position on page & add status box header
        if (device_card_reference == 260) {
            await _deviceTitle(payload);
        }
        // Add Devices
        await _reportDevice(payload);
        // Call Footer for final page
        await _reportFooter()
        console.log("PDF Done - Writing File")
        // File Meta Data
        let fileObj = {
            type: payload.type,
            siteId: payload.siteId,
            fileName: fileName,
            timeStamp: timeStamp
        }
        // Create file to store PDF
        await _writeFile(fileObj)
        doc.end()
        console.log("PDF MADE?")
        return (fileObj)
    } catch (err) {
        console.error('MakePDF ERROR: ' + err.message);
        return (err.message)
    }
}
pipeline() runs asynchronously, so it's not awaited, which is why doc.end() runs before the file is done.
Try wrapping pipeline in a promise, and resolve when the stream is done:
// function that returns a promise
function _writeFile(fileObj) {
    return new Promise((resolve, reject) => {
        const fileStream = fs.createWriteStream(fileObj.fileName);
        pipeline(
            doc,
            fileStream,
            async (err) => {
                if (err) {
                    console.error('PDF failed', err);
                    // err, handle in `.catch`
                    reject({res: 'Pipeline failed', err});
                } else {
                    console.log('PDF succeeded');
                    // done, resolve, to move on to doc.end
                    resolve('PDF succeeded');
                }
            }
        )
    });
}
Add .catch() to handle the error:
// Create file to store PDF
await _writeFile(fileObj).catch(err => console.log(err));
Or, even better, use the stream promises API:
const { pipeline } = require('stream/promises');

async function _writeFile(fileObj) {
    const fileStream = fs.createWriteStream(fileObj.fileName);
    await pipeline(doc, fileStream);
    console.log('PDF succeeded');
}
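One caveat, assuming doc here is the PDFKit document from the question: pipeline() only settles once the source stream ends, so doc.end() must be called before (or without awaiting) the pipeline, otherwise the await never resolves:
doc.end();                 // finalize the PDF source first...
await _writeFile(fileObj); // ...then wait for the bytes to reach disk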

Why doesn't the for loop wait for the process to finish in a function in Node.js?

I am trying to create a script to download pages from multiple URLs using Node.js, but the loop doesn't wait for the request to finish before continuing. I also got a hint to use an async for loop, but it still didn't work.
Here's my code:
function GetPage(url){
    console.log(` Downloading page ${url}`);
    request({
        url: `${url}`
    }, (err, res, body) => {
        if(err) throw err;
        console.log(` Writing html to file`);
        fs.writeFile(`${url.split('/').slice(-1)[0]}`, `${body}`, (err) => {
            if(err) throw err;
            console.log('saved');
        });
    });
}
var list = [ 'https://www.someurl.com/page1.html', 'https://www.someurl.com/page2.html', 'https://www.someurl.com/page3.html' ]
const main = async () => {
    for(let i = 0; i < list.length; i++){
        console.log(` processing ${list[i]}`);
        await GetPage(list[i]);
    }
};
main().catch(console.error);
Output:
processing https://www.someurl.com/page1.html
Downloading page https://www.someurl.com/page1.html
processing https://www.someurl.com/page2.html
Downloading page https://www.someurl.com/page2.html
processing https://www.someurl.com/page3.html
Downloading page https://www.someurl.com/page3.html
Writing html to file
Writing html to file
saved
saved
Writing html to file
saved
There are a couple of problems with your code.
You are mixing callback-style code with code that should be using promises. Also, your GetPage function is not async (it doesn't return a promise), so you cannot await it.
You just have to return a promise from your getPage() function and correctly resolve or reject it.
function getPage(url) {
    return new Promise((resolve, reject) => {
        console.log(` Downloading page ${url}`);
        request({ url: `${url}` }, (err, res, body) => {
            if (err) return reject(err);
            console.log(` Writing html to file`);
            fs.writeFile(`${url.replace(/\//g, '-')}.html`, `${body}`, (writeErr) => {
                if (writeErr) return reject(writeErr);
                console.log("saved");
                resolve();
            });
        });
    });
}
You don't have to change your main() function; its loop will await the promise returned by getPage().
A for loop doesn't wait for a callback to finish; it keeps executing. You need to either turn the getPage function into one that returns a promise or use Promise.all, as shown below.
var list = [
    "https://www.someurl.com/page1.html",
    "https://www.someurl.com/page2.html",
    "https://www.someurl.com/page3.html",
];

function getPage(url) {
    return new Promise((resolve, reject) => {
        console.log(` Downloading page ${url}`);
        request({ url: `${url}` }, (err, res, body) => {
            if (err) return reject(err);
            console.log(` Writing html to file`);
            fs.writeFile(`${url}.html`, `${body}`, (writeErr) => {
                if (writeErr) return reject(writeErr);
                console.log("saved");
                resolve();
            });
        });
    });
}

const main = async () => {
    // Promise.all starts every download in parallel and settles once all are done
    return Promise.all(list.map((path) => getPage(path)));
};
main().catch(console.error);
main().catch(console.error);
GetPage() is not built around promises and doesn't even return a promise so await on its result does NOTHING. await has no magic powers. It awaits a promise. If you don't give it a promise that properly resolves/rejects when your async operation is done, then the await does nothing. Your GetPage() function returns nothing so the await has nothing to do.
What you need is to fix GetPage() so it returns a promise that is properly tied to your asynchronous result. Because the request() library has been deprecated and is no longer recommended for new projects and because you need a promise-based solution anyway so you can use await with it, I'd suggest you switch to one of the alternative promise-based libraries recommended here. My favorite from that list is got(), but you can choose whichever one you like best. In addition, you can use fs.promises.writeFile() for promise-based file writing.
Here's how that code would look using got():
const got = require('got');
const { URL } = require('url');
const path = require('path');
const fs = require('fs');

function getPage(url) {
    console.log(` Downloading page ${url}`);
    return got(url).text().then(data => {
        // can't just use an URL for your filename as it contains potentially illegal
        // characters for the file system
        // so, add some code to create a sanitized filename here

        // find just the root filename in the URL
        let urlObj = new URL(url);
        let filename = path.basename(urlObj.pathname);
        if (!filename) {
            filename = "index.html";
        }
        let extension = path.extname(filename);
        if (!extension) {
            filename += ".html";
        } else if (extension === ".") {
            filename += "html";
        }
        console.log(` Writing file ${filename}`);
        return fs.promises.writeFile(filename, data);
    });
}
const list = ['https://www.someurl.com/page1.html', 'https://www.someurl.com/page2.html', 'https://www.someurl.com/page3.html'];

async function main() {
    for (let url of list) {
        console.log(` processing ${url}`);
        await getPage(url);
    }
}

main().then(() => {
    console.log("all done");
}).catch(console.error);
If you put real URLs in the array, this is directly runnable in nodejs. I ran it myself with my own URLs.
Summary of Changes and Improvements:
Switched from request() to got() because it's promise-based and not deprecated.
Modified getPage() to return a promise that represents the asynchronous operations in the function.
Switched to fs.promises.writeFile() so we are using only promises for asynchronous control-flow.
Added legal filename generation from the base path of the URL since you can't just use a full URL as a filename (at least in some file systems).
Switched to a simpler for/of loop.

Break from async function in await call

Okay, so I'm working with data from Memcache using a promise-based library, but the issue I'm having is that I don't know a way to break out of the async handler if a result is found.
The code I'm working with is:
const _pong = function() {
    return socket.emit('aye', {
        pong: globals.uuid()
    });
};

return socket.on('helo', async function(data) {
    socket._uuid = data.uuid;
    let key = 'ws-ping:' + data.uuid;
    await cache.get(key).then((result) => {
        if(result !== undefined) {
            _pong();
        }
    });
    ......
});
I basically need to just ignore the rest of the socket.on function if a result is found for the given key, but it seems to continue.
Because you're using await, you can ditch the .then and get the result directly in the same block. (A return inside a .then callback only returns from that inner callback, not from the surrounding socket.on handler, which is why the rest of the function kept running.) If the result exists, just return (after _ponging, if that's the logic you're looking for):
return socket.on('helo', async function(data) {
    socket._uuid = data.uuid;
    let key = 'ws-ping:' + data.uuid;
    const result = await cache.get(key);
    if (result !== undefined) {
        _pong();
        return;
    }
    // ...
});

For loop in promise.then()?

I need to iterate between two values and create/touch files (I/O) on each iteration.
I'm using the fs-promise module to do so asynchronously:
const path = require('path');
const fsp = require('fs-promise');

function addPages(startAt, pages, mode) {
    let htmlExt = mode.HTML;
    let cssExt = mode.CSS;
    fsp.readFile(path.join('.', 'templates', 'body.html'), { encoding: 'utf-8' })
        .then((content) => {
            // return Promise.all(() => {}).then().catch(); // Do this.
            for (let i = startAt, endAt = startAt + pages; i < endAt; i++) {
                console.log(i);
                fsp.writeFile(path.join('.', 'manuscript', `page-${i}`, `style.${cssExt}`), '')
                    .then(() => { console.log('Yay!') })
                    .catch(console.log.bind(console));
                // fsp.writeFile(path.join('.', 'manuscript', `page-${i}`, `style.${cssExt}`), '')
                //     .then((i, templateHTML) => {
                //         fsp.writeFile(path.join('.', 'manuscript', `page-${i}`, `body.${htmlExt}`), content);
                //     })
                //     .catch((err) => {
                //         console.log.bind(console);
                //     });
            }
        })
        .catch((err) => {
            if (err) return error('Couldn\'t create pages', err);
        });
}
Now, I did read that Promise.all([array of promises]) is the way to go for looping inside the then() scope, but the question is why/how?
I'm unable to wrap my head around why the for loop doesn't execute before the context moves out of the promised then() scope, and then how I should get to the expected outcome.
const path = require('path');
const fsp = require('fs-promise');

function addPages(startAt, pages, mode) {
    let htmlExt = mode.HTML;
    let cssExt = mode.CSS;
    return fsp.readFile(path.join('.', 'templates', 'body.html'), { encoding: 'utf-8' })
        .then((content) => {
            var pendingWrites = [];
            for (let i = startAt, endAt = startAt + pages; i < endAt; i++) {
                let filename = path.join('.', 'manuscript', `page-${i}`, `style.${cssExt}`);
                let thisWrite = fsp.writeFile(filename, '');
                pendingWrites.push(thisWrite);
            }
            return Promise.all(pendingWrites);
        })
        .catch((err) => {
            // either fully recover from the error or rethrow
            console.log("Could not add pages: ", err);
            throw err;
        });
}
As elaborated in the comments, resist the temptation to introduce non-functional .catch() handlers into your promise chain.
Non-functional means in this case: It does not recover from the error and does not rethrow the error. A catch handler that does not throw marks an error as handled, i.e. it returns a resolved promise, not a rejected one. This makes proper error handling later in the promise chain impossible. It's bad practice and unhelpful.
If you want to log the error, log it and rethrow it. If you have fully recovered from the error and subsequent code is unimpeded, don't rethrow.
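To make the distinction concrete, a minimal sketch of both cases (somePromise is illustrative):
// BAD: swallows the error; callers see a resolved promise
somePromise.catch((err) => {
    console.log(err);
});

// GOOD: logs for diagnostics, then rethrows so callers still see the rejection
somePromise.catch((err) => {
    console.log(err);
    throw err;
});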

Async/Await not waiting

I'm running into an issue which I don't fully understand. I feel like there are likely concepts which I haven't grasped, code that could be optimized, and possibly a bug thrown in for good measure.
To greatly simplify the overall flow:
A request is made to an external API
The returned JSON object is parsed and scanned for link references
If any link references are found, additional requests are made to populate/replace link references with real JSON data
Once all link references have been replaced, the original request is returned and used to build content
Here is the original request (#1):
await Store.get(Constants.Contentful.ENTRY, Contentful[page.file])
Store.get is represented by:
async get(type, id) {
    return await this._get(type, id);
}
Which calls:
_get(type, id) {
    return new Promise(async (resolve, reject) => {
        var data = _json[id] = _json[id] || await this._api(type, id);
        console.log(data)
        if(isAsset(data)) {
            resolve(data);
        } else if(isEntry(data)) {
            await this._scan(data);
            resolve(data);
        } else {
            const error = 'Response is not entry/asset.';
            console.log(error);
            reject(error);
        }
    });
}
The API call is:
_api(type, id) {
    return new Promise((resolve, reject) => {
        Request('http://cdn.contentful.com/spaces/' + Constants.Contentful.SPACE + '/' + (!type || type === Constants.Contentful.ENTRY ? 'entries' : 'assets') + '/' + id + '?access_token=' + Constants.Contentful.PRODUCTION_TOKEN, (error, response, data) => {
            if(error) {
                console.log(error);
                reject(error);
            } else {
                data = JSON.parse(data);
                if(data.sys.type === Constants.Contentful.ERROR) {
                    console.log(data);
                    reject(data);
                } else {
                    resolve(data);
                }
            }
        });
    });
}
When an entry is returned, it is scanned:
_scan(data) {
    return new Promise((resolve, reject) => {
        if(data && data.fields) {
            const keys = Object.keys(data.fields);
            keys.forEach(async (key, i) => {
                var val = data.fields[key];
                if(isLink(val)) {
                    var child = await this._get(val.sys.linkType.toUpperCase(), val.sys.id);
                    this._inject(data.fields, key, undefined, child);
                } else if(isLinkArray(val)) {
                    var children = await* val.map(async (link) => await this._get(link.sys.linkType.toUpperCase(), link.sys.id));
                    children.forEach((child, index) => {
                        this._inject(data.fields, key, index, child);
                    });
                } else {
                    await new Promise((resolve) => setTimeout(resolve, 0));
                }
                if(i === keys.length - 1) {
                    resolve();
                }
            });
        } else {
            const error = 'Required data is unavailable.';
            console.log(error);
            reject(error);
        }
    });
}
If link references are found, additional requests are made and then the resulting JSON is injected into the original JSON in place of the reference:
_inject(fields, key, index, data) {
    if(isNaN(index)) {
        fields[key] = data;
    } else {
        fields[key][index] = data;
    }
}
Notice, I'm using async, await, and Promises, I believe in their intended manner. What ends up happening: the calls for referenced data (the gets resulting from _scan) end up occurring after the original request is returned. This ends up providing incomplete data to the content template.
Additional information concerning my build setup:
npm#2.14.2
node#4.0.0
webpack#1.12.2
babel#5.8.34
babel-loader#5.4.0
I believe the issue is in your forEach call in _scan. For reference, see this passage in Taming the asynchronous beast with ES7:
However, if you try to use an async function, then you will get a more subtle bug:
let docs = [{}, {}, {}];

// WARNING: this won't work
docs.forEach(async function (doc, i) {
    await db.post(doc);
    console.log(i);
});

console.log('main loop done');
console.log('main loop done');
This will compile, but the problem is that this will print out:
main loop done
0
1
2
What's happening is that the main function is exiting early, because the await is actually in the sub-function. Furthermore, this will execute each promise concurrently, which is not what we intended.
The lesson is: be careful when you have any function inside your async function. The await will only pause its parent function, so check that it's doing what you actually think it's doing.
So each iteration of the forEach call is running concurrently; they're not executing one at a time. As soon as the one that matches the criteria i === keys.length - 1 finishes, the promise is resolved and _scan returns, even though other async functions called via forEach are still executing.
You would need to either change the forEach to a map to return an array of promises, which you can then await* from _scan (if you want to execute them all concurrently and then call something when they're all done), or execute them one-at-a-time if you want them to execute in sequence.
As a side note, if I'm reading them right, some of your async functions can be simplified a bit; remember that, while awaiting an async function call returns a value, simply calling it returns another promise, and returning a value from an async function is the same as returning a promise that resolves to that value in a non-async function. So, for example, _get can be:
async _get(type, id) {
    var data = _json[id] = _json[id] || await this._api(type, id);
    console.log(data)
    if (isAsset(data)) {
        return data;
    } else if (isEntry(data)) {
        await this._scan(data);
        return data;
    } else {
        const error = 'Response is not entry/asset.';
        console.log(error);
        throw error;
    }
}
Similarly, _scan could be (assuming you want the forEach bodies to execute concurrently):
async _scan(data) {
    if (data && data.fields) {
        const keys = Object.keys(data.fields);
        const promises = keys.map(async (key, i) => {
            var val = data.fields[key];
            if (isLink(val)) {
                var child = await this._get(val.sys.linkType.toUpperCase(), val.sys.id);
                this._inject(data.fields, key, undefined, child);
            } else if (isLinkArray(val)) {
                var children = await* val.map(async (link) => await this._get(link.sys.linkType.toUpperCase(), link.sys.id));
                children.forEach((child, index) => {
                    this._inject(data.fields, key, index, child);
                });
            } else {
                await new Promise((resolve) => setTimeout(resolve, 0));
            }
        });
        await* promises;
    } else {
        const error = 'Required data is unavailable.';
        console.log(error);
        throw error;
    }
}
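A side note for readers on newer toolchains: await* was a short-lived Babel 5 construct and is not standard JavaScript; the standard equivalent is Promise.all. For example, the last step above becomes:
// instead of: await* promises;
await Promise.all(promises);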
