Having difficulties with node.js res.send() loop - node.js

I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the urls has a table, which contains information pertaining to the ID of the url. This is the data I am attempting to retrieve and store inside a json file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res){
var json;
for (var i = 1163; i < 1166; i++){
url = 'https://urlgoeshere.com' + i;
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var mN, mL, iD;
var json = { mN : "", mL : "", iD: ""};
$('html body div#wrap h2').filter(function(){
var data = $(this);
mN = data.text();
json.mN = mN;
})
$('table.vertical-table:nth-child(7)').filter(function(){
var data = $(this);
mL = data.text();
json.mL = mL;
})
$('table.vertical-table:nth-child(8)').filter(function(){
var data = $(this);
iD = data.text();
json.iD = iD;
})
}
fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
})
});
}
res.send(json);
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output within the output.json file only contains data for the last url. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data writes after each page, I receive the error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green

I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, thus you do some things like res.send() before the asynchronous operations in the loop have completed.
In appropriate use of cheerio's .filter().
Your json variable is constantly being overwritten so it only has the last data in it.
Your loop variable i would lose its value by the time you tried to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;
app.get('/scrape', async function(req, res) {
let data = [];
for (let i = 1163; i < 1166; i++) {
const url = 'https://urlgoeshere.com/' + i;
try {
const html = await rp(url)
const $ = cheerio.load(html);
const mN = $('html body div#wrap h2').first().text();
const mL = $('table.vertical-table:nth-child(7)').first().text();
const iD = $('table.vertical-table:nth-child(8)').first().text();
// create object for this iteration of the loop
const obj = {iD, mN, mL};
// add this object to our overall array of all the data
data.push(obj);
// write a file specifically for this invocation of the loop
await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
} catch(e) {
// stop further processing on an error
console.log("Error scraping ", url, e);
res.sendStatus(500);
return;
}
}
// send all the data we accumulated (in an array) as the final result
res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.

Related

node's module function return value empty/undefined?

I'm trying to get the html encoded table row value, returned from the slqLite based logger. As I'm new to node modules I'm stuck at:
var sqlite3 = require('sqlite3').verbose();
var db = new sqlite3.Database(':memory:');
var html = '';
module.exports = {
readHtml: function() {
var html = ''; // optional but does not work here as well
db.serialize(function() {
db.each("SELECT rowid AS id, info FROM logger", function(err, row) {
html = html + '<tr><td>' + row.info + '<td><tr>'; << html is growing
console.log('Log: ' + row.info); << working
});
});
console.log(html); // html gets empty here!
return html;
}
}
So have no value returned from:
var sysLog = require('logger');
sysLog.init();
sysLog.write('test string1');
sysLog.write('test string2');
console.log(sysLog.readHtml());
It has to be very simple to be solved ...
node is 6.7
You problem is directly related to a very common issue when starting with JavaScript:
How do I return the response from an asynchronous call?
Which shows the simplest way to receive results of an asynchronous operation, such as db.each is using a callback.
function readHtml()
var html = ''
db.serialize(function() {
db.each(..., function(err, row) {
// this line executes sometime later
// after we already returned from readHtml()
});
});
// this line executes right after the call to db.serialize
// but before db.each calls the callback we give to it.
// At this point, html is '' because we still didn't read any rows
// (they are read asynchronously, sometime later)
return html;
}
readHtml(); // <-- this is always '' because rows are read at a later point
To solve this, you would need a function that will be called with a callback like this:
readHtml(function(html) { // <-- this callback gets called once all rows are read
console.log(html);
});
Your situation also has an additional complication that db.each calls its callback once for every row. By looking at the docs, you can see that db.each accepts an additional complete callback when all rows are read. You can use this callback to signalize reading is done and pass the html results.
Here's how you can define readHtml:
function readHtml(callback) { // pass in a callback to call once all rows are read and all html is accumulated
var html = '';
db.serialize(function() {
// read all the rows and accumulate html as before
db.each("SELECT rowid AS id, info FROM logger", function(err, row) {
html = html + '<tr><td>' + row.info + '<td><tr>';
}, function() {
callback(html); // use the second callback to signal you are done and pass the html back
});
});
}

Node.js: given array of URLs, determine which are valid

I am a total scrub with the node http module and having some trouble.
The ultimate goal here is to take a huge list of urls, figure out which are valid and then scrape those pages for certain data. So step one is figuring out if a URL is valid and this simple exercise is baffling me.
say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate this array, make a get request to each and if a response comes in, add the link to a list of workingURLs (for now just another array), else it goes to a list brokenURLs.
var workingURLs = [];
var brokenURLs = [];
for (var i = 0; i < allURLs.length; i++) {
var url = allURLs[i];
var req = http.get(url, function (res) {
if (res) {
workingURLs.push(?????); // How to derive URL from response?
}
});
req.on('error', function (e) {
brokenURLs.push(e.host);
});
}
what I don't know is how to properly obtain the url from the request/ response object itself, or really how to structure this kind of async code - because again, I am a nodejs scrub :(
For most websites using res.headers.location works, but there are times when the headers do not have this property and that will cause problems for me later on. Also I've tried console logging the response object itself and that was a messy and fruitless endeavor
I have tried pushing the url variable to workingURLs, but by the time any response comes back that would trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help
You need to closure url value to have access to it and protect it from changes on next loop iteration.
For example:
(function(url){
// use url here
})(allUrls[i]);
Most simple solution for this is use forEach instead of for.
allURLs.forEach(function(url){
//....
});
Promisified solution allows you to get a moment when work is done:
var http = require('http');
var allURLs = [
"http://www.yahoo.com/",
"http://www.stackoverflow.com/",
"http://www.sdfhksdjfksjdhg.net/"
];
var workingURLs = [];
var brokenURLs = [];
var promises = allURLs.map(url => validateUrl(url)
.then(res => (res?workingURLs:brokenURLs).push(url)));
Promise.all(promises).then(() => {
console.log(workingURLs, brokenURLs);
});
// ----
function validateUrl(url) {
return new Promise((ok, fail) => {
http.get(url, res => return ok(res.statusCode == 200))
.on('error', e => ok(false));
});
}
// Prevent nodejs from exit, don't need if any server listen.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
You can use something like this (Not tested):
const arr = ["", "/a", "", ""];
Promise.all(arr.map(fetch)
.then(responses=>responses.filter(res=> res.ok).map(res=>res.url))
.then(workingUrls=>{
console.log(workingUrls);
console.log(arr.filter(url=> workingUrls.indexOf(url) == -1 ))
});
EDITED
Working fiddle (Note that you can't do request to another site in the browser because of Cross domain).
UPDATED with #vp_arth suggestions
const arr = ["/", "/a", "/", "/"];
let working=[], notWorking=[],
find = url=> fetch(url)
.then(res=> res.ok ?
working.push(res.url) && res : notWorking.push(res.url) && res);
Promise.all(arr.map(find))
.then(responses=>{
console.log('woking', working, 'notWorking', notWorking);
/* Do whatever with the responses if needed */
});
Fiddle

EventEmitter memory leak detected: Proper way to pass CSV data to multiple modules?

I am dipping my toe into using different npm modules my own way whereas before I just executed already created gulpfiles. The npm module penthouse loads a webpage and determines the above the fold CSS for that page. I am trying to take that module and use it with a site crawler so I can get the above the fold css for all pages, and store that CSS in a table.
So essentially I am:
Crawling a site to get all the urls
capturing the page id from each url
storing pages & their id's in a CSV
load the CSV and pass each URL to penthouse
take penthouse output and store it in a table
So I am fine up until the last two steps. When I am reading the CSV, I get the error possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit.
The stack trace points here at line 134. After reading about the error, it makes sense because I see a bunch of event listeners being added, but I don't see penthouse ever really executing and closing the event listeners.
It works just fine standalone as expected (Running penthouse against a single page then exiting). But when I execute the code below to try and loop through all URLs in a csv, it spits out the memory leak error twice, and just hangs. None of my console.log statements in the following script are executed.
However, I added console.log to the end of the penthouse index.js file, and it is executed multiple times (where it adds event listeners), but it never timeouts or exits.
So it's clear I am not integrating this properly, but not sure how to proceed. What would be the best way to force it to read one line in the CSV at a time, process the URL, then take the output and store it in the DB before moving onto the next line?
const fs = require('fs');
var csv = require('fast-csv');
var penthouse = require('penthouse'),
path = require('path');
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var csvStream = csv()
//returns single line from CSV
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
penthouse({
url : data[2],
css : './dist/styles/main.css'
}, function(err, criticalCss) {
if (err) {
console.log(err);
}
console.log('do we ever get here?'); //answer is no
if (data[1] === 'post') {
wp.posts().id( data[0] ).post({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved to db');
});
} else {
wp.pages().id( data[0] ).page({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved to db');
});
}
});
})
.on("end", function(){
console.log("done");
});
return stream.pipe(csvStream);
};
UPDATE
Changed my method to look like below so it processes all rows first, but still throws the same error. Writes "done" to the console, and immediately spits out the memory warning twice.
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var urls = [];
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done");
buildCriticalCss(urls);
});
return stream.pipe(csvStream);
};
var buildCriticalCss = function(urls) {
//console.log(urls);
urls.forEach(function(data, idx) {
//console.log(data);
penthouse({
url : data[2],
css : './dist/styles/main.css',
// OPTIONAL params
width : 1300, // viewport width
height : 900, // viewport height
timeout: 30000, // ms; abort critical css generation after this timeout
strict: false, // set to true to throw on css errors (will run faster if no errors)
maxEmbeddedBase64Length: 1000 // charaters; strip out inline base64 encoded resources larger than this
}, function(err, criticalCss) {
if (err) {
console.log(err);
}
console.log('do we ever finish one?');
if (data[1] === 'post') {
console.log('saving post ' + data[0]);
wp.posts().id( data[0] ).post({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved post to db');
});
} else {
console.log('saving page ' + data[0]);
wp.pages().id( data[0] ).page({
inline_css: criticalCss
}).then(function( response ) {
console.log('saved page to db');
});
}
});
});
};
Update 2
I took the simple approach to control the amount of concurrent processes spawned.
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
var urls = [];
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done");
//console.log(urls);
buildCriticalCss(urls);
});
return stream.pipe(csvStream);
};
function buildCriticalCss(data) {
var row = data.shift();
console.log(row);
penthouse({
url : row[2],
css : './dist/styles/main.css',
// OPTIONAL params
width : 1300, // viewport width
height : 900, // viewport height
timeout: 30000, // ms; abort critical css generation after this timeout
strict: false, // set to true to throw on css errors (will run faster if no errors)
maxEmbeddedBase64Length: 1000 // charaters; strip out inline base64 encoded resources larger than this
}, function(err, criticalCss) {
if (err) {
console.log('err');
}
// handle your criticalCSS
console.log('finished');
console.log(row[2]);
// now start next job, if we have more urls
if (data.length !== 0) {
buildCriticalCss(data);
}
});
}
The error message you're seeing is a default printed to the console by node's event library if more than the allowed number of event listeners are defined for an instance of EventEmitter. It does not indicate an actual memory leak. Rather it is displayed to make sure you're aware of the possibility of a leak.
You can see this by checking the event.EventEmitter source code at lines 20 and 244.
To stop EventEmitter from displaying this message and since penthouse does not expose its specific EventEmitter, you'll need to set the default allowed event emitter listeners to something larger than its default value of 10 using:
var EventEmitter=require('event').EventEmitter;
EventEmitter.defaultMaxListeners=20;
Note that according to Node's documentation for EventEmitter.defaultMaxListeners, this will change the maximum number of listeners for all instances of EventEmitter, including those that have already been defined previous to the change.
Or you could simply ignore the message.
Further to the hanging of your code, I'd advise gathering all the results from the parsing of your CSV into an array, and then processing the array contents separately from the parsing process.
This would accomplish two things: It would allow you to
be assured the entire CSV file was valid before you started processing, and
instrument debugging messages while processing each element, which would give you deeper insight into how each element of the array was processed.
UPDATE
As noted below, depending on how many URLs you're processing, you're probably overwhelming Node's ability to handle all of your requests in parallel.
One easy way to proceed would be to use eventing to marshall your processing so your URLs are processed sequentially, as in:
var assert=require('assert'),
event=require('events'),
fs=require('fs'),
csv=require('fast-csv');
penthouse=require('penthouse');
var emitter=new events.EventEmitter();
/** Container for URL records read from CSV file.
*
* #type {Array}
*/
var urls=[];
/** Reads urls from file and triggers processing
*
* #emits processUrl
*/
var readUrlCsv = function() {
var stream = fs.createReadStream("/home/vagrant/urls.csv");
stream.on('error',function(e){ // always handle errors!!
console.error('failed to createReadStream: %s',e);
process.exit(-1);
});
var csvStream = csv()
.on("data", function(data) {
// data[0]: table id, data[1]: page type, data[2]: url
urls.push(data);
})
.on("end", function(){
console.log("done reading csv");
//console.log(urls);
emitter.emit('processUrl'); // start processing URLs
})
.on('error',function(e){
console.error('failed to parse CSV: %s',e);
process.exit(-1);
});
// no return required since we don't do anything with the result
stream.pipe(csvStream);
};
/** Event handler to process a single URL
*
* #emits processUrl
*/
var onProcessUrl=function(){
// always check your assumptions
assert(Array.isArray(urls),'urls must be an array');
var urlRecord=urls.shift();
if(urlRecord){
assert(Array.isArray(urlRecord),'urlRecord must be an array');
assert(urlRecord.length>2,'urlRecord must have at least three elements');
penthouse(
{
// ...
},
function(e,criticalCss){
if(e){
console.error('failed to process record %s: %s',urlRecord,e);
return; // IMPORTANT! do not drop through to rest of func!
}
// do what you need with the result here
if(urls.length===0){ // ok, we're done
console.log('completed processing URLs');
return;
}
emitter.emit('processUrl');
}
);
}
}
/**
* processUrl event - triggers processing of next URL
*
* #event processUrl
*/
emitter.on('processUrl',onProcessUrl); // assign handler
// start everything going...
readUrlCsv();
The benefit of using events here rather than your solution is the lack of recursion which can easily overwhelm your stack.
Hint: You can use events to handle all program flow issues normally addressed by Promises or modules like async.
And since events are at the very heart of Node (the "event loop"), it's really the best, most efficient way to solve such problems.
It's both elegant and "The Node Way"!
Here is a gist that illustrates the technique, without relying on streams or penthouse, the output of which is:
url: url1
RESULT: RESULT FOR url1
url: url2
RESULT: RESULT FOR url2
url: url3
RESULT: RESULT FOR url3
completed processing URLs
Besides using console.logs which usually is enough, you can also use the built in debugger: https://nodejs.org/api/debugger.html
Another thing you can do is go into the node_modules/penthouse directory and add your console.logs or debugger statement into the code for that module. That way you can debug your program there rather than the module just being a black box.
Also make sure there isn't some kind of race condition where for example the CSV doesn't always get output before it tries to read them in.
I think that the memory leak issue is probably a red herring as far as making your code function.
From your comment it sounds like you want to do something like the following with async.mapSeries: http://promise-nuggets.github.io/articles/15-map-in-series.html You could also use promises as it shows or even after getting promises set up use the async/await stuff with a regular for loop after compiling with babel. In the long run I recommend doing that sort of thing with async/await and babel but that might be overkill just to get this working.

Nodejs run promises sequentially

Dears ,
How can i run promises in nodejs sequentially , in the following example am looping through array of hours then for each fetched hour get result from the database , the issue here : am getting results but i want it sequentially same order that i got hours .
angular.forEach(SharedVar.getCategories(), function (h) {
t = h.split('-', 2);
t = t[0];
RESTApi.getAnswerdCallsByHour(t).then(function (answerdTotal) {
$scope.answerdCallsTotalByHour.push(answerdTotal);
var d = SharedVar.getDataS();
d[count] = answerdTotal;
SharedVar.setDataS(d);
count++;
});
});
Thanks ,
var promise = Promise.resolve(); // make an empty promise in the way you do it with your promise library
angular.forEach(SharedVar.getCategories(), function (h) {
promise.then(function() {
return RESTApi.getAnswerdCallsByHour(t).then(function (answerdTotal) {});
});
});
The way to do it sequently would be to do one Request and do the next request inside the promise.
I think the better approach by far is to extend your SharedVar.setDataS(d) function in a way, that it does not depend on getting the data sequentially. Like having a SharedVar.setDataS(d, index) and using the config var in your $http.get (or whatever) functioncall inside your RESTApi to promote that index all the way to the promise.
If your RESTApi looks like this:
var RESTApi = {
getAnswerdCallsByHour : function(hour) {
var url = "bla.com/myservice?hour=" + hour;
return $http.get(url).data;
}
// Some other Methods...
Then you need a way to pass something to "reorder" your Data when it arrives asynchronously, this could be a index you count up or in your case maybe the hour Variable:
var RESTApi = {
getAnswerdCallsByHour : function(hour) {
var url = "bla.com/myservice?hour=" + hour;
var config = [];
config.hour = hour;
return $http.get(url, config); // Return the promise not just data or a specific field
}
// Some other Methods...
Now when your promise is fullfiled you can access your "hour" Variable like so:
var d = SharedVar.getDataS();
d[promise.config.hour] = promise.data;
SharedVar.setDataS(d);
Now you know what piece of data correlates to which request and you do not need to recieve Data in order. The last piece only works properly when hours runs sequential from 0 to 23, if that isn't the case you need to:
var RESTApi = {
getAnswerdCallsByHour : function(hour, index) {
var url = "bla.com/myservice?hour=" + hour;
var config = [];
config.index = index;
return $http.get(url, config);
}
// Some other Methods...
...
...
var d = SharedVar.getDataS();
d[promise.config.index] = promise.data;
SharedVar.setDataS(d);
Safari's answer is how I typically handle this. (Sorry, I don't have enough rep to comment yet...) You were experiencing problems with it because the example provided does not capture and use the new promise in subsequent loops. See my comments on the slightly modified version here:
var promise = Promise.resolve();
angular.forEach(SharedVar.getCategories(), function (h) {
t = h.split('-', 2);
t = t[0];
// You must capture the new promise here; the next loop will wait
// for the promise returned from getAnswerdCallsByHour to resolve.
promise = promise.then(function() {
// Halt downstream promises until this returned promises finishes
return RESTApi.getAnswerdCallsByHour(t).then(function (answerdTotal) {
$scope.answerdCallsTotalByHour.push(answerdTotal);
var d = SharedVar.getDataS();
d[count] = answerdTotal;
SharedVar.setDataS(d);
count++;
});
});
});

How use async functions with node-pdfkit?

I am trying to create a PDF file with PDFKit. I insert an image with like this:
var PDFDocument = require('pdfkit');
var doc = new PDFDocument();
doc.image(some_image_as_buffer);
and it is working like expected. But now want the image be trimmed and I found GraphicsMagick for node.js. But the problem that I have is to make it work with PDFKit. doc.image expects a filename or a buffer, but since I already have a buffer I want to work with buffers (there is no file anywhere because the buffer comes directly from the database).
The trimming works like this:
var gm = require('gm');
gm(some_image_as_buffer, 'image.png')
.trim()
.toBuffer(function(err, trimmed_image_buffer) {
// trimmed_image_buffer is correct,
// but I can't put it to the document like this:
doc.image(trimmed_image_buffer);
// beacause I don't know which page and/or position
// the doc is currently on, because of the asynchronous
// nature of this callback.
});
UPDATE:
For clarification: I want to be able to use the asynchronous trimmed image in the synchronous code for PDFKit. PDFKit only works synchronously and gm doesn't offer a synchronous interface.
UPDATE2:
var gm = require('gm');
gm(some_image_as_buffer, 'image.png')
.trim()
.toBuffer(function(err, trimmed_image_buffer) {
// trimmed_image_buffer is correct,
// but I can't put it to the document like this:
doc.image(trimmed_image_buffer);
// beacause I don't know which page and/or position
// the doc is currently on, because of the asynchronous
// nature of this callback.
});
doc.text('some text');
// is not guaranteed to run after image is inserted
// and a couple of hundred lines more
After the last line in this example there are a lot more lines of code which add content to the PDF, but I don't want to put everything (couple of hundred lines) in one callback just because I need on asynchronous function to manipulate the image.
Is there any way to make this manipulation synchronous?
UPDATE_2
You basically ask for stopping execution of a code until some asynchronous operation has completed. For sure it is not possible in general case.
In case of gm module, it is not possible either. The gm module spawns a new process for executing a command (in your case trim()) and the API for spawning new processes is asynchronous in its very nature.
UPDATE
To make use of promise in your scenario:
var gm = require('gm'),
Q = require('Q'),
PDFDocument = require('pdfkit'),
doc = new PDFDocument();
function getTrimmedImage(some_image_as_buffer){
var deferred = Q.defer();
gm(some_image_as_buffer, 'image.png')
.trim()
.toBuffer(function(err, trimmed_image_buffer) {
if(err) { deferred.reject(err); }
else { deferred.resolve(trimmed_image_buffer); }
});
return deferred.promise;
}
// here goes all manipulations before the trimmed image is inserted
getTrimmedImage(some_image_as_buffer).then(
function(trimmed_image_buffer){
doc.image(trimmed_image_buffer);
// here goes all manipulations after the trimmed image is inserted
}
);
As I wrote in the comment above, a promise based solution should work elegantly. I use Q library, but any other promise library will do the job, as well.
One option would be to collect all resources of asynchronous nature before starting manipulating the pdf. Then you are guaranteed that no race condition occur, though it may slow down the whole process. I used a toy example to have it working in the browser environment, let me know if you have any problems converting it to your use case:
function getAsyncResource(){
var defer = Q.defer();
setTimeout(function(){
var result = "Some value: " + Date.now();
console.log("Async resource resolved: " + result);
defer.resolve(result);
}, Math.random() * 5000);
return defer.promise;
}
function someOperationThatNeedsAsyncResources(A, B, C){
console.log("Have all resources: ", A, B, C);
}
var A = getAsyncResource(),
B = getAsyncResource(),
C = getAsyncResource();
Q.all([A,B,C]).spread(someOperationThatNeedsAsyncResources);
<script src="//cdnjs.cloudflare.com/ajax/libs/q.js/1.1.2/q.js"></script>
Other option would be to split the process into steps, like so:
function getAsyncResource(value){
var defer = Q.defer();
setTimeout(function(){
var result = "Some value: " + value;
console.log("Async resource resolved: " + result);
defer.resolve(result);
}, Math.random() * 5000);
return defer.promise;
}
function nextStep(resource){
console.log("Next step: " + resource);
}
var A = getAsyncResource("A"),
B = getAsyncResource("B"),
C = getAsyncResource("C");
A.then(nextStep)
.then(function(){return B;})
.then(nextStep)
.then(function(){return C;})
.then(nextStep);
<script src="//cdnjs.cloudflare.com/ajax/libs/q.js/1.1.2/q.js"></script>

Resources