Throttling intensive IO tasks in node.js

I'm playing around with Node.js, trying to rewrite a particularly poorly designed part of my production system at work. So far, so good: I use RabbitMQ for messaging, and my Node.js part of the system runs the Ghostscript command-line tool to convert TIFF files to PDF. Obviously I need to make sure I'm not running more than some fixed number of conversions at a time. What would be the best way to do this with Node? I understand that Node.js maybe isn't really meant for heavy disk IO work, but I'm having too much fun with it to quit.
I was considering just using a blocking call to execute the command-line utilities, but some messages don't require this conversion and there's no need to delay their processing.

[Update] node-batches seems more appropriate.
I think you need something like forEachLimit (the following snippet was extracted from the async library)
forEachLimit = function (arr, limit, iterator, callback) {
    callback = callback || function () {};
    if (!arr.length || limit <= 0) {
        return callback();
    }
    var completed = 0;
    var started = 0;
    var running = 0;

    (function replenish () {
        if (completed === arr.length) {
            return callback();
        }
        while (running < limit && started < arr.length) {
            iterator(arr[started], function (err) {
                if (err) {
                    callback(err);
                    callback = function () {};
                }
                else {
                    completed += 1;
                    running -= 1;
                    if (completed === arr.length) {
                        callback();
                    }
                    else {
                        replenish();
                    }
                }
            });
            started += 1;
            running += 1;
        }
    })();
};
Usage:
var fileToConvert = ['file1', 'file2', 'file3'];
var maxConcurrency = 4;

function fnIter(item, callback){
    console.log('converting', item);
    // Conversion happens here
    require('child_process').exec("some -f " + item, function(error, stdout, stderr){
        callback(error); // error is null if the command exited successfully
    });
}

function fnDone(){
    console.log('done !');
}

forEachLimit(fileToConvert, maxConcurrency, fnIter, fnDone);
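For comparison, here is a minimal sketch that uses the async library's eachLimit directly instead of copying the snippet out of it (assuming async is installed from npm; the "some -f" command and the concurrency of 4 are just the placeholders from the example above):
var async = require('async');
var exec = require('child_process').exec;

// same placeholder list as above
var fileToConvert = ['file1', 'file2', 'file3'];

// Run at most 4 conversions at a time; the final callback fires
// once every item is done, or as soon as one of them fails.
async.eachLimit(fileToConvert, 4, function (item, done) {
    exec('some -f ' + item, function (error, stdout, stderr) {
        done(error);
    });
}, function (err) {
    if (err) {
        console.error('a conversion failed:', err);
    } else {
        console.log('done !');
    }
});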

Related

Nodejs - Re-calling a function on error callback - Is there a non-blocking way?

I have a function which makes a request to an API. Sometimes the API has hiccups and isn't available for a second or two every now and then, resulting in an error, on which I'd like to call the function again. Since there are another 70-80 lines of code following this callback, I wouldn't like to split the flow with an if(error) <do the same stuff> else <as here>.
After trying for quite some time I ended up using a do-while(error) loop, which works but blocks. Is there an async way of doing this?
My code (simplified for generalization):
//This is the request part
function bar(j, callback){
    j++;
    //simulated error
    if(j<=10){
        return( callback('dont', j) );
    }
    //simulated success
    else{
        return( callback(null, j) );
    }
}

//This is the main function - part of a bigger piece in my code
function foo(){
    var i = 0;
    var err = 'yes'; //else we'd get an 'err is not defined'
    do{
        bar(i, function(error, j){
            i = j;
            err = error;
            if(error){
                console.log(i);
            }
            else{
                return( console.log('done it!') );
                // There's more here in my code
            }
        });
    } while (err);
    console.log('I blocked');
}
foo();
Edit:
For those interested, this is the output:
1
2
3
4
5
6
7
8
9
10
done it!
I blocked
What I would suggest is that you make a function for your operation. If it fails, you set a short timer and retry after that timer fires. This gives you asynchronous behavior between retries, so other code in the server can run.
function requestRetry(url, data, retryTimes, retryDelay, callback) {
    var cntr = 0;

    function run() {
        // try your async operation
        request(..., function(err, data) {
            ++cntr;
            if (err) {
                if (cntr >= retryTimes) {
                    // if it fails too many times, just send the error out
                    callback(err);
                } else {
                    // try again after a delay
                    setTimeout(run, retryDelay);
                }
            } else {
                // success, send the data out
                callback(null, data);
            }
        });
    }
    // start our first request
    run();
}

requestRetry(someUrl, someData, 10, 500, function(err, data) {
    if (err) {
        // still failed after 10 retries
    } else {
        // got successful result here
    }
});
This is a fairly simple retry scheme: it just retries on a fixed interval for a fixed number of times. More complicated schemes implement a back-off algorithm, where they start with fairly quick retries but then back off to a longer period between retries after the first few failures, to give the server a better chance of recovering. If there happen to be lots and lots of clients all doing rapid retries, then as soon as your server has a hiccup you can get an avalanche failure, as all the clients suddenly start rapidly retrying, which just puts your server in even more trouble trying to handle all those requests. The back-off algorithm is designed to give the server a better chance of preventing an avalanche failure and make it easier for it to recover.
The back-off scheme is also more appropriate if you're waiting for the service to come back online after it's been down a little while.
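As a rough illustration of that idea (my sketch, not part of the original answer), the delay used by the requestRetry skeleton above could grow per attempt; the doubling factor and the 30-second cap are arbitrary example values:
// Hypothetical helper: compute the wait before retry attempt `attempt`
// (1-based), doubling each time and capping at 30 seconds.
function backoffDelay(baseDelay, attempt) {
    return Math.min(baseDelay * Math.pow(2, attempt - 1), 30000);
}

// e.g. with a 500 ms base delay:
// backoffDelay(500, 1) === 500
// backoffDelay(500, 2) === 1000
// backoffDelay(500, 3) === 2000
// ...and inside requestRetry you would use
// setTimeout(run, backoffDelay(retryDelay, cntr))
// instead of setTimeout(run, retryDelay).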
For retrying an HTTP call when it fails with an error, you first need to check whether the error is retryable or not.
RETRIABLE_NETWORK_ERRORS = ['ECONNRESET', 'ENOTFOUND', 'ESOCKETTIMEDOUT', 'ETIMEDOUT', 'ECONNREFUSED', 'EHOSTUNREACH', 'EPIPE', 'EAI_AGAIN'];
If the error is one of the RETRIABLE_NETWORK_ERRORS, then you go into the retry logic; otherwise treat it as a real error.
For the retry logic, use an exponential backoff algorithm; you can follow https://developers.google.com/api-client-library/java/google-http-java-client/backoff
const _ = require('lodash');
const request = require('request');

var httpArray = getHttpReq(); //return array

function makeHttpRequest() {
    _.each(httpArray, function (httpRequest) {
        retryRequest(httpRequest);
    });
}

function retryRequest(httpRequest) {
    const MAX_RETRY = 2;
    var retryCnt = 0;
    Retry();

    function Retry() {
        request(httpRequest, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                console.log(body);
            }
            else {
                if (retryCnt < MAX_RETRY) {
                    retryCnt += 1;
                    var currRetryIntervalMs = (1 << retryCnt) * 1000; //exponential back off logic
                    setTimeout(Retry, currRetryIntervalMs);
                }
                else {
                    console.log('http fail');
                }
            }
        });
    }
}
Here is an asynchronous loop
function asyncLoop(i, range, callback) {
    var results = 0;
    if(i < range) {
        // do something, update results
        asyncLoop(i+1, range, callback);
    } else {
        callback(null, results);
    }
}
To loop it 10 times, call it as follows:
asyncLoop(0, 10, function(err, results) {
    console.log(results);
});
Pass your error condition in place of range and check it inside the loop. Hope this helps.
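For illustration only (this sketch is mine, not part of the original answer), the same recursive idea applied to the bar() function from the question, using setImmediate so the retries never block the event loop:
function fooAsync(){
    var i = 0;

    (function attempt(){
        bar(i, function(error, j){
            i = j;
            if(error){
                console.log(i);
                // schedule the next attempt instead of looping synchronously
                setImmediate(attempt);
            }
            else{
                console.log('done it!');
                // the rest of the original flow goes here
            }
        });
    })();
}

fooAsync(); // prints 1..10, then 'done it!', without blocking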

Intro to node.js - Print data from 3 urls (http.get)

I'm doing an introduction to node.js using learnyounode. I wonder if you could help me get my head around this thing: asynchronism.
So, here is the problem:
This problem is the same as the previous problem (HTTP COLLECT) in that you need to use http.get(). However, this time you will be provided with three URLs as the first three command-line arguments.
You must collect the complete content provided to you by each of the URLs and print it to the console (stdout). You don't need to print out the length, just the data as a String; one line per URL. The catch is that you must print them out in the same order as the URLs are provided to you as command-line arguments.
And here is my bad solution which, in fact, doesn't work:
var http = require('http');
var message = [];

for (var i = 2; i < 5; i++)
    http.get(process.argv[i], function (res) {
        res.setEncoding('utf8');
        res.on('data', function(line) {
            message[i] += line.toString();
        });
        res.on('end', function(line) {
            for (var i = 0; i < 3; i++)
                console.log(message[i]);
        });
    });
UPDATE
So I tried a similar approach to your solution.
Here goes:
var http = require('http');
var count = 0;
var message = ["","",""];

for (var i = 2; i < 5; i++)
{
    http.get(process.argv[i], function (res) {
        res.setEncoding('utf8');
        res.on('data', function( line ) {
            message[count] += line.toString();
        });
        res.on('end', function(line) {
            count++;
            if(count !== 3)
                return;
            else
                printOutput();
        });
    });
}

function printOutput(){
    for (var i = 0; i < 3; i++)
        console.log(message[i]);
}
But the output is not in the right order :/
CURRENT: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
EXPECTED: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
CURRENT ". He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter.."
EXPECTED: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
CURRENT: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
EXPECTED: "He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter..."
CURRENT: ""
EXPECTED ""
A cleaner way to do it asynchronously is to put all the Promises in an array and call Promise.all() on that array:
var http = require('http');

var promises = [
    promiseLoad(process.argv[2]),
    promiseLoad(process.argv[3]),
    promiseLoad(process.argv[4])
];

Promise.all(promises).then(function(res){
    console.log(res);
});

function promiseLoad(url) {
    var body = '';
    return new Promise(function(resolve, reject) {
        http.get(url, function(res) {
            res.on('data', function(d) {
                body += d;
            });
            res.on('end', function() {
                resolve(body);
            });
        });
    });
}
You have to wait for the prior request to reach the 'end' event before processing the next request, hence the asynchronous challenge. This can be accomplished via callbacks, or promises.
Promise implementation:
var http = require('http');

promiseLoad(process.argv[2])
    .then(function() { return promiseLoad(process.argv[3]); })
    .then(function() { return promiseLoad(process.argv[4]); });

function promiseLoad(url) {
    var body = '';
    return new Promise(function(resolve, reject) {
        http.get(url, function(res) {
            res.on('data', function(d) {
                body += d;
            });
            res.on('end', function() {
                console.log(body);
                resolve();
            });
        });
    });
}
I will leave the callback implementation to you as an exercise. As a starting point, the next request will have to be fired only once the end event is fired.
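For reference, a minimal callback-based sketch of that idea (my own, not part of the original answer) could look like this, firing each request only after the previous one has emitted end:
var http = require('http');

function loadSequentially(urls, index) {
    if (index >= urls.length) return;
    var body = '';
    http.get(urls[index], function (res) {
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            console.log(body);
            // only now start the next request
            loadSequentially(urls, index + 1);
        });
    });
}

loadSequentially(process.argv.slice(2), 0);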
UPDATE:
To load these truly asynchronously and at the same time, your code will work with minor modifications. You simply need to wait for end to be called 3 times and only log at that point, indicating that all loading is complete:
var http = require('http');
var count = 0;
var message = ['', '', ''];

// use `let` so each iteration's callbacks capture their own index
for (let i = 2; i < 5; i++)
    http.get(process.argv[i], function (res) {
        res.setEncoding('utf8');
        var correctIndex = i - 2;
        res.on('data', function(line) {
            message[correctIndex] += line.toString();
        });
        res.on('end', function(line) {
            count++;
            if(count !== 3) return;
            for (var i = 0; i < 3; i++)
                console.log(message[i]);
        });
    });
First, I want to say that the answer already here that says to use Promise.all() is the way that I would suggest. However, I want to point out a particular scenario where it might not cover your needs.
Consider that you have 4 requests:
"Service" | "Time to complete"
----------------------------
A | 3
B | 1
C | 5
D | 4
And you're going to use a load handler similar to what has already been mentioned:
// Url loader
function load(url) {
    var message = "";
    return new Promise(function (resolve, reject) {
        http.get(url, function (res) {
            // Add message piece
            res.on("data", function (data) {
                message += data;
            });
            // Resolve whole message
            res.on("end", function (data) {
                resolve(message);
            });
        });
    });
}
Printing After Everything Finishes
If you use Promise.all(), you are going to have to wait for all of the requests to finish before you see any output. So if we output a timestamp with our data, we will get the following:
Code
/*
    Wait for all promises to complete and then
    print out all of their collected data
*/
Promise.all(promises).then(function (res) {
    res.forEach(function (data) {
        timestamp(data);
    });
});
Output
[14:9:4.106] Start
[14:9:10.335] aaaa
[14:9:10.336] bbbb
[14:9:10.336] cccc
[14:9:10.336] dddd
Where it takes 6 seconds after we start to see any output from the result of our services.
Printing As Soon As Possible
Comparatively, if we wanted to print output while we are getting results from our service calls, we need to print each result as its service finishes, but not until all "prior" services are done. With that in mind, we could do something like this:
Code
promises[0].then(function (dataA) {
    timestamp(dataA);
    promises[1].then(function (dataB) {
        timestamp(dataB);
        promises[2].then(function (dataC) {
            timestamp(dataC);
            promises[3].then(function (dataD) {
                timestamp(dataD);
            });
        });
    });
});
Output
[14:16:19.245] Start
[14:16:22.974] aaaa
[14:16:22.975] bbbb
[14:16:25.474] cccc
[14:16:25.474] dddd
Here, we see the start, then only 3 seconds later we print out both Service A and Service B. We see A because its service just resolved and B because its service was already done, but we didn't want to print until A was finished. Similarly, C and D show up about 2 seconds after B.
Now, that code is somewhat verbose, so we could write a recursive function to handle all that nesting for us.
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
    var curr = 0;

    // This closure is going to recursively print out our promises
    function nexter(data) {
        if (data) {
            timestamp(data);
        }
        curr += 1;
        // Have the next promise print its data whenever it is done
        if (curr < promiseArr.length) {
            promiseArr[curr].then(nexter);
        }
    }
    // Wait for our first promise to finish and have it kick off the next
    promiseArr[curr].then(nexter);
}
I haven't really run into many use cases where we need to make "synchronous" usage of asynchronous data, but I'm sure there is a potential need for it somewhere.
Test Code Used:
Change the method variable if you want to use the other methods.
/*global Promise*/
"use strict";

// Provide response times for fake services
function getUrlTiming(url) {
    var timing = 0;
    switch (url) {
    case "a":
        timing = 3000;
        break;
    case "b":
        timing = 1000;
        break;
    case "c":
        timing = 5000;
        break;
    case "d":
        timing = 4000;
        break;
    default:
        timing = 0;
        break;
    }
    return timing;
}

// Service to wrap events
function Service() {
    this.listeners = [];
}

Service.prototype = {
    on: function (event, cb) {
        if (!this.listeners[event]) {
            this.listeners[event] = [];
        }
        this.listeners[event].push(cb);
    },
    emit: function (event, details) {
        if (this.listeners[event]) {
            this.listeners[event].forEach(function (cb) {
                cb(details);
            });
        }
    }
};

// Make a fake http module
var http = {
    get: function (url, cb) {
        // Make an event emitter
        var req = new Service();
        // If we got a callback
        if (cb && (typeof cb === "function")) {
            // Call it to set up listeners
            cb(req);
        }
        // Make a promise to resolve after the service finishes
        return new Promise(function (resolve, reject) {
            var network,
                message = "",
                part = 0,
                maxParts = 4;
            /*
                Create a network simulation to send a message in parts
                until the request finishes
            */
            network = setInterval(function () {
                // If the message isn't complete
                if (part < 4) {
                    // Add to the whole message tracker
                    message += url;
                    // Emit that we got data
                    req.emit("data", url);
                    // Increment how far in the message we are
                    part += 1;
                } else {
                    // Stop transmitting
                    clearInterval(network);
                    // Emit the end of the request
                    req.emit("end", message);
                    // Resolve the request
                    resolve(url);
                }
            }, (getUrlTiming(url) / maxParts));
        });
    }
};

// Url loader
function load(url) {
    var message = "";
    return new Promise(function (resolve, reject) {
        http.get(url, function (res) {
            // Add message piece
            res.on("data", function (data) {
                message += data;
            });
            // Resolve whole message
            res.on("end", function (data) {
                resolve(message);
            });
        });
    });
}

// Get a readable time
function getTime() {
    var now = new Date();
    return (now.getHours() + ":" + now.getMinutes() + ":" + now.getSeconds() + "." + now.getMilliseconds());
}

// Print a timestamped message
function timestamp(message) {
    console.log("[%s] %s", getTime(), message);
}
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
    var curr = 0;

    // This closure is going to recursively print out our promises
    function nexter(data) {
        if (data) {
            timestamp(data);
        }
        curr += 1;
        // Have the next promise print its data whenever it is done
        if (curr < promiseArr.length) {
            promiseArr[curr].then(nexter);
        }
    }
    // Wait for our first promise to finish and have it kick off the next
    promiseArr[curr].then(nexter);
}
/*
    No matter what, we want all of our requests to
    start right now, and effectively at the same time.
    We don't want to start one after another finishes
*/
var promises = [
    load("a"),
    load("b"),
    load("c"),
    load("d")
];

/*
    Which method we want to use to test our stuff
    Change between [1, 2, 3] for each method listed
    below. 1 for Promise.all(), 2 for ASAP printing,
    and 3 for the verbose version of 2.
*/
var method = 3;

// Note when we started
timestamp("Start");

if (method === 1) {
    /*
        Wait for all promises to complete and then
        print out all of their collected data
    */
    Promise.all(promises).then(function (res) {
        res.forEach(function (data) {
            timestamp(data);
        });
    });
} else if (method === 2) {
    /*
        Print each ones data as soon as it is
        available; but make sure to do it in order
    */
    cascadeInOrder(promises);
} else if (method === 3) {
    /*
        This is the same as the "cascadeInOrder" function,
        except written without recursion and more verbosely.
    */
    promises[0].then(function (dataA) {
        timestamp(dataA);
        promises[1].then(function (dataB) {
            timestamp(dataB);
            promises[2].then(function (dataC) {
                timestamp(dataC);
                promises[3].then(function (dataD) {
                    timestamp(dataD);
                });
            });
        });
    });
}
@Luís Melo
Here's my solution after going through this thread:
var http = require('http');
var bl = require('bl');

var promises = [
    promiseLoad(process.argv[2]),
    promiseLoad(process.argv[3]),
    promiseLoad(process.argv[4])
];

Promise.all(promises).then(function(res) {
    for (var i = 0; i < promises.length; i++) {
        console.log(res[i]);
    }
});

function promiseLoad(url) {
    var body = '';
    return new Promise(function(resolve, reject) {
        http.get(url, function (response) {
            response.setEncoding('utf8');
            response.pipe(bl(function (err, data) {
                resolve(data.toString());
            }));
        });
    });
}
Here's the official solution in case you want to compare notes:
var http = require('http')
var bl = require('bl')
var results = []
var count = 0
function printResults () {
for (var i = 0; i < 3; i++) {
console.log(results[i])
}
}
function httpGet (index) {
http.get(process.argv[2 + index], function (response) {
response.pipe(bl(function (err, data) {
if (err) {
return console.error(err)
}
results[index] = data.toString()
count++
if (count === 3) {
printResults()
}
}))
})
}
for (var i = 0; i < 3; i++) {
httpGet(i)
}

How does node.js implement async callbacks with a single process?

I don't know how Node implements its amazing idea, and I have a question about using it.
I have to read four files, file1.js, file2.js, file3.js and file4.js, and concatenate them into one big JavaScript file, result.js. It's important to keep their order.
So it's natural for me to use readFileSync instead of readFile.
I know it's a bad solution. Does anyone have a better idea?
Q: Is it possible for node.js to read four files at the same time?
Hope someone can explain the principle of node.js and when process.nextTick will be fired.
A: Yes, it is possible for node to read 4 files at the same time.
My answer would be: it depends on your situation whether to read the files synchronously or asynchronously. If it's configuration data, or the files can be cached, I would suggest just doing it synchronously; it's easy, and it's only done once, so you won't be waiting around very much. Long operations on initialization are typical and can make things more efficient in the long run. That being said, reading four files in order, asynchronously, so that your program can do other things in the background, isn't that hard. I will work on sync and async examples of each and add an edit.
/* jshint node:true*/
var fs = require('fs');

function readFilesSync(fileNames) {
    'use strict';
    var results = '';
    for (var i = 0; i < fileNames.length; i++) {
        results += fs.readFileSync(fileNames[i]);
    }
    return results;
}

function readFiles(fileNames, callback) {
    'use strict';
    var results = '';
    function readFile(index) {
        if (index < fileNames.length) {
            fs.readFile(fileNames[index], function (err, data) {
                results += data;
                readFile(index + 1);
            });
        } else {
            callback(results);
        }
    }
    readFile(0);
}

function readAllFilesAtOnce(fileNames, callback) {
    'use strict';
    var results = {};
    var numFiles = fileNames.length;

    function callBackWrapper() {
        var resultsOrdered = '';
        for (var i = 0; i < fileNames.length; i++) {
            resultsOrdered += results[fileNames[i]];
        }
        callback(resultsOrdered);
    }

    function readFileAsync(fileName) {
        fs.readFile(fileName, function (err, data) {
            results[fileName] = data;
            numFiles--;
            if (numFiles === 0) {
                callBackWrapper();
            }
        });
    }

    for (var i = 0; i < fileNames.length; i++) {
        readFileAsync(fileNames[i]);
    }
}

function doSomethingWithTheData(data) {
    'use strict';
    console.log('Results async: ' + data);
}

function doSomethingWithTheData2(data) {
    'use strict';
    console.log('Results async all at once: ' + data);
}

var fileNamesArray = ['blah.js', 'file.js', 'hello.txt'];

console.log('The results sync: ' + readFilesSync(fileNamesArray));
readFiles(fileNamesArray, doSomethingWithTheData);
readAllFilesAtOnce(fileNamesArray, doSomethingWithTheData2);
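As a side note, on newer Node versions (roughly 10 and later) a shorter sketch of the same "read all at once, keep the order" idea can use fs.promises with Promise.all, which starts every read concurrently and still returns the results in the original order; the file names are the ones from the question:
const fs = require('fs').promises;

// Start all reads at once; Promise.all preserves the input order
// regardless of which read finishes first.
async function concatFiles(fileNames) {
    const contents = await Promise.all(
        fileNames.map(name => fs.readFile(name, 'utf8'))
    );
    return contents.join('');
}

concatFiles(['file1.js', 'file2.js', 'file3.js', 'file4.js'])
    .then(result => console.log(result))
    .catch(err => console.error(err));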
EDIT: Above I added a method to read all of the files at once.
process.nextTick does no more than run the given function on the next turn of the event loop. Ex:
process.nextTick(function() {
    console.log('never printed out');
});
while(true);
ex 2:
process.nextTick(function() {
    console.log('printed last');
});
console.log('printed first');

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP GET request and I want to parse the response and save it to my database.
If I call crawl(i) independently I get good results. But I have to call crawl() from 1 to 2000. I get good results, but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What I am crawling
My node functions:
function getOptions(i) {
    return {
        host: 'magicseaweed.com',
        path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
        method: 'GET'
    }
};

function crawl(i){
    var req = http.request(getOptions(i), function(res) {
        res.on('data', function (body) {
            parseLocation(body);
        });
    });
    req.end();
}

function parseLocation(body){
    parser.parseString(body, function(err, result) {
        if(result && typeof result.rss != 'undefined') {
            var locationTitle = result.rss.channel[0].title;
            var locationString = result.rss.channel[0].item[0].link[0];
            var location = new Location({
                id: locationString.split('/')[2],
                name: locationTitle
            });
            location.save();
        }
    });
}

N = 2 //# of simultaneous tasks
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);

q.drain = function() {
    console.log('Crawling done.');
}

for(var i = 0; i < 100; i++){
    q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] Well, after a lot of testing it seems that the service I am crawling cannot handle so many requests that fast, because when I do each request sequentially, I can get all the good responses.
Is there a way to SLOW DOWN the async queue method?
You should have a look at this great module, async, which simplifies async tasks like this. You can use queue; simple example:
var N = 5; // # of simultaneous tasks
var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function(){
        callback();
    });
}, N);

q.drain = function() {
    console.log('all items have been processed');
}

for(var i = 0; i < 2000; i++){
    q.push({url: "http://somewebsite.com/" + i + "/feed/"});
}
It will have a window of ongoing actions, and room for a future task becomes available only when you invoke the callback function. The difference is that your code now opens 2000 connections immediately, and obviously the failure rate is high. Limiting it to a reasonable value (5, 10, 20, depending on the site and connection) will result in a better success rate. If a request fails, you can always try it again, or push the task to another async queue for another trial. The key point is to invoke callback() in the queue function, so that a slot becomes available when a task is done.
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);
You're executing the next task immediately after starting the previous one; this way, the queue is just meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* naming this argument 'next' is more meaningful */) {
    crawl(task.url, function () {
        // after this one is done, start the next one.
        next();
    });
    // or, more simply: crawl(task.url, next);
}, N);
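As a rough sketch of that first step (my own illustration, not code from the answer), the crawl function from the question could be changed to take that callback: it buffers the whole response and only signals the queue once parsing is done:
// crawl now accepts a `done` callback; as in the question, `i` is the
// id passed to getOptions(i)
function crawl(i, done) {
    var req = http.request(getOptions(i), function (res) {
        var body = '';
        res.on('data', function (chunk) {
            body += chunk; // collect the whole RSS document before parsing
        });
        res.on('end', function () {
            parseLocation(body);
            done(); // free a slot in the queue only when this task is finished
        });
    });
    req.on('error', done); // still release the slot if the request fails
    req.end();
}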
Another option if you want. Vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];

var myInterval = setInterval(function() {
    incrementer++;
    if(incrementer == 100){
        clearInterval(myInterval);
        //when done parse results array
    }
    //make request here
    //push request result to array here
}, 500);
Invokes the function every half second. Easy way to force sync and exit after x requests.
I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an API endpoint, using Node 4 or Node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/");
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;

// small helper so the `for...of range(...)` loop below works
function* range(start, end) {
    for (var i = start; i <= end; i++) {
        yield i;
    }
}

fs.readFile('test.txt', function(err, data){
    var parsedItems = JSON.parse(data);
    var urlparts = [];

    // create a queue
    for (let year of range(1975, 2016)) {
        for (var make in parsedItems[year]){
            console.log(year, make, '/models/' + year + '/' + make);
            urlparts.push({urlpart: '/models/' + year + '/' + make, year: year, make: make});
        }
    }

    // start dequeue
    waitDequeue();

    // This function calls itself after the makeRequest promise completes
    function waitDequeue(){
        var item = urlparts.pop();
        if (item){
            makeRequest(item)
                .then(function(){
                    // wait this time before next dequeue
                    setTimeout(function() {
                        waitDequeue();
                    }, 3000);
                });
        } else {
            write(parsedItems);
        }
    }

    // make a request, mutate parsedItems then resolve
    function makeRequest(item){
        return new Promise((resolve, reject)=>{
            request
                .get(item.urlpart)
                .set(options.auth[0], options.auth[1])
                .set(options.type[0], options.type[1])
                .end(function(err, res) {
                    if (err) return reject(err);
                    console.log(res.body);
                    res.body.forEach(function(model){
                        parsedItems[item.year][item.make][model] = {};
                    });
                    resolve();
                });
        });
    }

    // write the results back to the file
    function write(parsedItems){
        fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
            console.log(err);
        });
    }
});
A little late, but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler, e.g.:
var q = async.priorityQueue(function(task, callback) {
    // your code process here for each task
    // when ready to complete the task, delay it by calling
    var count = 0;
    async.whilst( // wait 10 seconds
        function() {
            return count < 10;
        },
        function(callback) {
            count++;
            setTimeout(function() {
                callback(null, count);
            }, 1000);
        },
        function (err, n) {
            // n seconds have passed
            callback(); // callback to q handler
        }
    ); // whilst
}, 5);

Limiting asynchronous calls in Node.js

I've got a Node.js app that gets a list of files locally and uploads them to a server. This list could contain thousands of files.
for (var i = 0; i < files.length; i++) {
    upload_file(files[i]);
}
If I execute this with thousands of files, upload_file will get called thousands of times all at once, and most likely die (or at least struggle). In the synchronous world, we'd create a thread pool and limit it to a certain number of threads. Is there a simple way to limit how many asynchronous calls get executed at once?
As usual, I recommend Caolan McMahon's async module.
Make your upload_file function take a callback as its second parameter:
var async = require("async");

function upload_file(file, callback) {
    // Do funky stuff with file
    callback();
}

var queue = async.queue(upload_file, 10); // Run ten simultaneous uploads

queue.drain = function() {
    console.log("All files are uploaded");
};

// Queue your files for upload
queue.push(files);

queue.concurrency = 20; // Increase to twenty simultaneous uploads
The answer above, re: async on NPM, is the best answer, but if you'd like to learn more about control flow:
You should look into control flow patterns. There's a wonderful discussion of control flow patterns in Chapter 7 of Mixu's Node Book. Namely, I'd look at the example in 7.2.3: Limited parallel - an asynchronous, parallel, concurrency-limited for loop.
I've adapted his example:
function doUpload(file, done) {
    // perform file read & upload here, then call done()...
}

var files = [...];
var limit = 10;   // concurrent read / upload limit
var running = 0;  // number of running async file operations

function uploader() {
    while(running < limit && files.length > 0) {
        var file = files.shift();
        doUpload(file, function() {
            running--;
            if(files.length > 0)
                uploader();
        });
        running++;
    }
}
uploader();
You should try queueing. I assume that a callback is fired when upload_file() finishes. Something like this should do the trick (untested):
function upload_files(files, maxSimultaneousUploads, callback) {
    var runningUploads = 0,
        startedUploads = 0,
        finishedUploads = 0;

    function next() {
        runningUploads--;
        finishedUploads++;

        if (finishedUploads == files.length) {
            callback();
        } else {
            // Make sure that we are running at the maximum capacity.
            queue();
        }
    }

    function queue() {
        // Run as many uploads as possible while not exceeding the given limit.
        while (startedUploads < files.length && runningUploads < maxSimultaneousUploads) {
            runningUploads++;
            upload_file(files[startedUploads++], next);
        }
    }

    // Start the upload!
    queue();
}
The other answers seem to be outdated. This can be solved easily using parallelLimit from async. Below is how to use it. I haven't tested it.
var async = require('async');

var tasks = files.map(function(f) {
    return function(callback) {
        upload_file(f, callback);
    };
});

async.parallelLimit(tasks, 10, function(){
});
No external libraries, just plain JS.
It can be solved using recursion.
The idea is that initially we immediately start the maximum allowed number of uploads, and each of these requests recursively initiates a new upload on its completion.
In this example I collect successful responses together with errors and I execute all requests, but it's possible to slightly modify the algorithm if you want to terminate the batch upload on the first failure.
async function batchUpload(files, limit) {
    limit = Math.min(files.length, limit);

    return new Promise((resolve, reject) => {
        const responsesOrErrors = new Array(files.length);
        let startedCount = 0;
        let finishedCount = 0;
        let hasErrors = false;

        function recursiveUpload() {
            let index = startedCount++;

            uploadFile(files[index])
                .then(res => {
                    responsesOrErrors[index] = res;
                })
                .catch(error => {
                    responsesOrErrors[index] = error;
                    hasErrors = true;
                })
                .finally(() => {
                    finishedCount++;
                    if (finishedCount === files.length) {
                        hasErrors ? reject(responsesOrErrors) : resolve(responsesOrErrors);
                    } else if (startedCount < files.length) {
                        recursiveUpload();
                    }
                });
        }

        for (let i = 0; i < limit; i++) {
            recursiveUpload();
        }
    });
}

async function uploadFile(file) {
    console.log(`${file} started`);
    const delay = Math.floor(Math.random() * 1500);

    return new Promise((resolve, reject) => {
        setTimeout(() => {
            if (delay <= 1000) {
                console.log(`${file} finished successfully`);
                resolve(`${file} success`);
            } else {
                console.log(`${file} finished with error`);
                reject(`${file} error`);
            }
        }, delay);
    });
}

const files = new Array(10).fill('file').map((file, index) => `${file}_${index + 1}`);

batchUpload(files, 3)
    .then(responses => console.log('All successful', responses))
    .catch(responsesWithErrors => console.log('All with several failed', responsesWithErrors));
