limit concurrent operations nodejs

limit concurrent operations nodejs - node.js

This is a web scraping code written in node js.
Will this code always keep 5 concurrent request when queue has enough urls?
Why the console shows otherwise?
var request = require("request");
var cheerio = require("cheerio");
var fs = require('fs');
var concurrent_requests = 0;
var queue = [];
var baseUrl = "https://angularjs.org/";
function makeApiCall(url){
if(url) {
queue.unshift(url);
}
if(concurrent_requests<5) {
var nextUrl = queue.pop();
if(nextUrl) {
concurrent_requests++;
request(nextUrl, function (error, response, body) {
var invalidUrl;
concurrent_requests--;
if(body) {
var $ = cheerio.load(body);
var anchors = $("a");
var data = "";
for (var i = 0; i < anchors.length; i++) {
url = $(anchors[i]).attr("href");
if(!url || url === "#" || url === "javascript:void(0)"){
invalidUrl = true;
}
else{
invalidUrl = false;
}
if (!invalidUrl) {
makeApiCall(url);
data += url + ", " + nextUrl + "\n";
}
}
//console.log(data);
fs.appendFile('urls.csv',data, function (err) {
if (err) throw err;
});
}
else{
makeApiCall();
}
});
}
}
console.log(concurrent_requests);
}
makeApiCall(baseUrl);

Becoz, you have condition that states not to request more than 5 with an if statement.
if(concurrent_requests<5) {
This solution is not scalable as will go over the stack after certain recursive calls.
Hope it helps.

You are using if condition to check if the count of concurrent
requests are less then five or not. But remember it is if statement,
not loop. That means it will be called only once.
You are making a recursive call to your function makeApiCall inside
the callback of the request. The callback of the request only runs
when the request is fulfilled.
With above two points in mind, in your if condition you check if concurrent_requests<5 then you call request method, and your program goes ideal. After sometime when the request id fulfilled, the callback of request runs, which after some logic calls the makeApiCall again. So in every call you are calling request only once and then wait for that to resolve and then only your program proceed for next request.
If you want concurrent request then use a loop like this
function makeApiCall(url){
if(url) {
queue.unshift(url);
}
// Use a loop here
while(concurrent_requests<5) {
var nextUrl = queue.pop();
if(nextUrl) {
concurrent_requests++;
request(nextUrl, function (error, response, body) {
var invalidUrl;
concurrent_requests--;
if(body) {
...
if (!invalidUrl) {
makeApiCall(url);
data += url + ", " + nextUrl + "\n";
}
}
...
}
else{
makeApiCall();
}
});
}
else{
// Remember to break out of loop when queue is empty to avoid infinite loop.
break;
}
}
console.log(concurrent_requests);
}

Related

Intro to node.js - Print data from 3 urls (http.get)

I'm doing an introduction to node.js using learnyounode. I wonder if you could help realize this thing: asynchronism.
So, here is the problem:
This problem is the same as the previous problem (HTTP COLLECT) in
that you need to use http.get(). However, this time you will be
provided with three URLs as the first three command-line
arguments.
You must collect the complete content provided to you by each of the URLs and print it to the console (stdout). You don't need to
print out the length, just the data as a String; one line per URL.
The catch is that you must print them out in the same order as the
URLs are provided to you as command-line arguments.
and here is my bad solution who, in fact, don't work.
var http = require('http');
var message = [];
for (var i = 2; i < 5; i++)
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
res.on('data', function(line) {
message[i] += line.toString();
});
res.on('end', function(line) {
for (var i = 0; i < 3; i++)
console.log(message[i]);
});
});
UPDATE
So I tried a similar approach to your solution.
Here goes:
var http = require('http');
var count = 0;
var message = ["","",""];
for (var i = 2; i < 5; i++)
{
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
res.on('data', function( line ) {
message[count] += line.toString();
});
res.on('end', function(line) {
count++;
if(count !== 3)
return;
else
printOutput();
});
});
}
function printOutput(){
for (var i = 0; i < 3; i++)
console.log(message[i]);
}
But the output is lagged: / (not in the right order)
CURRENT: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
EXPECTED: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
CURRENT ". He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter.."
EXPECTED: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
CURRENT: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
EXPECTED: "He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter..."
CURRENT: ""
EXPECTED ""

a more cleaner way to do it asynchronously is by having all Promises in an array and calling Promise.all() on that array
var http = require('http');
promises = [
promiseLoad(process.argv[2]),
promiseLoad(process.argv[3]),
promiseLoad(process.argv[4])
];
Promise.all(promises).then(function(res){
console.log(res);
});
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function(res) {
res.on('data', function(d) {
body += d;
});
res.on('end', function() {
resolve(body);
});
});
});
}

You have to wait for the prior request to reach the 'end' event before processing the next request, hence the asynchronous challenge. This can be accomplished via callbacks, or promises.
Promise implementation:
var http = require('http');
promiseLoad(process.argv[2])
.then(promiseLoad(process.argv[3])
.then(promiseLoad(process.argv[4]);
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function(res) {
res.on('data', function(d) {
body += d;
});
res.on('end', function() {
console.log(body);
resolve();
});
});
});
}
I will leave the callback implementation to you as an exercise. As a starting point, the next request will have to be fired only once the end event if fired.
UPDATE:
To load these truly asynchronously and at the same time, your code will work with minor modifications. You need to simply wait for end to be called 3 times and only log at that point indicating that all loading is complete:
var http = require('http');
var count = 0;
var message = [];
for (var i = 2; i < 5; i++)
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
var correctIndex = i;
res.on('data', function(line) {
message[correctIndex] += line.toString();
});
res.on('end', function(line) {
count++;
if(count !== 3) return;
for (var i = 0; i < 3; i++)
console.log(message[i]);
});
});

First, I want to say that the answer already here that says to use Promise.all() is the way that I would suggest. However, I want to point out a particular scenario where it might not cover your needs.
Consider that you have 3 requests:
"Service" | "Time to complete"
----------------------------
A | 3
B | 1
C | 5
D | 4
And you're going to use a load handler similar to what has already been mentioned:
// Url loader
function load(url) {
var message = "";
return new Promise(function (resolve, reject) {
http.get(url, function (res) {
// Add message piece
res.on("data", function (data) {
message += data;
});
// Resolve whole message
res.on("end", function (data) {
resolve(message);
});
});
});
}
Printing After Everything Finishes
If you use the Promise.all(), you are going to have to wait for all of the requests to finish before you see any output. So if we output a timestamp with our data, we will get the following:
Code
/*
Wait for all promises to complete and then
print out all of their collected data
*/
Promise.all(promises).then(function (res) {
res.forEach(function (data) {
timestamp(data);
});
});
Output
[14:9:4.106] Start
[14:9:10.335] aaaa
[14:9:10.336] bbbb
[14:9:10.336] cccc
[14:9:10.336] dddd
Where it takes 6 seconds after we start to see any output from the result of our services.
Printing As Soon As Possible
Comparatively, if we wanted to print output while we are getting results from our service calls, we need to print the result as the service finishes, but not until all "prior" services are done. With that in mind, we could do could do something like this:
Code
promises[0].then(function (dataA) {
timestamp(dataA);
promises[1].then(function (dataB) {
timestamp(dataB);
promises[2].then(function (dataC) {
timestamp(dataC);
promises[3].then(function (dataD) {
timestamp(dataD);
});
});
});
});
Output
[14:16:19.245] Start
[14:16:22.974] aaaa
[14:16:22.975] bbbb
[14:16:25.474] cccc
[14:16:25.474] dddd
Here, we see the start, then only 3 seconds later we print out both Service A and Service B. We see A because its service just resolved and B because its service was already done, but we didn't want to print until A was finished. Similarly, C and D show up about 2 seconds after B.
Now, that code is somewhat verbose, so we could write a recursive function to handle all that nesting for us.
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
var curr = 0;
// This closure is going to recursively print out our promises
function nexter(data) {
if (data) {
timestamp(data);
}
// Have the next promise print its data whenever it is done
promiseArr[curr += 1].then(nexter);
}
// Wait for our first promise to finish and have it kick off the next
promiseArr[curr].then(nexter);
}
I haven't really run into many uses cases where we need to make "synchronous" usage of asynchronous data, but I'm sure that there is a potential need for it somewhere.
Test Code Used:
Change the method variable if you want to use the other methods.
/*global Promise*/
"use strict";
// Provide response times for fake services
function getUrlTiming(url) {
var timing = 0;
switch (url) {
case "a":
timing = 3000;
break;
case "b":
timing = 1000;
break;
case "c":
timing = 5000;
break;
case "d":
timing = 4000;
break;
default:
timing = 0;
break;
}
return timing;
}
// Service to wrap events
function Service() {
this.listeners = [];
}
Service.prototype = {
on: function (event, cb) {
if (!this.listeners[event]) {
this.listeners[event] = [];
}
this.listeners[event].push(cb);
},
emit: function (event, details) {
if (this.listeners[event]) {
this.listeners[event].forEach(function (cb) {
cb(details);
});
}
}
};
// Make a fake http module
var http = {
get: function (url, cb) {
// Make an event emiiter
var req = new Service();
// If we got a callback
if (cb && (typeof cb === "function")) {
// Call it to set up listeners
cb(req);
}
// Make a promise to resolve after the service finishes
return new Promise(function (resolve, reject) {
var network,
message = "",
part = 0,
maxParts = 4;
/*
Create a network simulation to send a massage in parts
until the request finishes
*/
network = setInterval(function () {
// If the message isn't complete
if (part < 4) {
// Add to the whole message tracker
message += url;
// Emit that we got data
req.emit("data", url);
// Increment how far in the message we are
part += 1;
} else {
// Stop transmitting
clearInterval(network);
// Emit the end of the request
req.emit("end", message);
// Resolve the request
resolve(url);
}
}, (getUrlTiming(url) / maxParts));
});
}
};
// Url loader
function load(url) {
var message = "";
return new Promise(function (resolve, reject) {
http.get(url, function (res) {
// Add message piece
res.on("data", function (data) {
message += data;
});
// Resolve whole message
res.on("end", function (data) {
resolve(message);
});
});
});
}
// Get a readable time
function getTime() {
var now = new Date();
return (now.getHours() + ":" + now.getMinutes() + ":" + now.getSeconds() + "." + now.getMilliseconds());
}
// Print a timestamped message
function timestamp(message) {
console.log("[%s] %s", getTime(), message);
}
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
var curr = 0;
// This closure is going to recursively print out our promises
function nexter(data) {
if (data) {
timestamp(data);
}
// Have the next promise print its data whenever it is done
promiseArr[curr += 1].then(nexter);
}
// Wait for our first promise to finish and have it kick off the next
promiseArr[curr].then(nexter);
}
/*
No matter what, we want all of our requests to
start right now, and effectively at the same time.
We don't want to start one after another finishes
*/
var promises = [
load("a"),
load("b"),
load("c"),
load("d")
];
/*
Which method we want to use to test our stuff
Change between [1, 2, 3] for each method listed
below. 1 for Promise.all(), 2 for ASAP printing,
and 3 for the verbose version of 2.
*/
var method = 3;
// Note when we started
timestamp("Start");
if (method === 1) {
/*
Wait for all promises to complete and then
print out all of their collected data
*/
Promise.all(promises).then(function (res) {
res.forEach(function (data) {
timestamp(data);
});
});
} else if (method === 2) {
/*
Print each ones data as soon as it is
available; but make sure to do it in order
*/
cascadeInOrder(promises);
} else if (method === 3) {
/*
This is the same as the "cascadeInOrder" function,
except written without recursion and more verbosely.
*/
promises[0].then(function (dataA) {
timestamp(dataA);
promises[1].then(function (dataB) {
timestamp(dataB);
promises[2].then(function (dataC) {
timestamp(dataC);
promises[3].then(function (dataD) {
timestamp(dataD);
});
});
});
});
}

#Luís Melo
Here's my solution after going through this thread:
var http = require('http');
var bl = require('bl')
promises = [
promiseLoad(process.argv[2]),
promiseLoad(process.argv[3]),
promiseLoad(process.argv[4])
];
Promise.all(promises).then(function(res) {
for(i=0; i<promises.length; i++) {
console.log(res[i]);
}
});
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function (response) {
response.setEncoding('utf8');
response.pipe(bl(function (err, data) {
resolve(data.toString())
}))
})
});
}
Here's the official solution in case you want to compare notes:
var http = require('http')
var bl = require('bl')
var results = []
var count = 0
function printResults () {
for (var i = 0; i < 3; i++) {
console.log(results[i])
}
}
function httpGet (index) {
http.get(process.argv[2 + index], function (response) {
response.pipe(bl(function (err, data) {
if (err) {
return console.error(err)
}
results[index] = data.toString()
count++
if (count === 3) {
printResults()
}
}))
})
}
for (var i = 0; i < 3; i++) {
httpGet(i)
}

Node.js + Cheerio : Request inside a loop

I'm using cheerio, request and Node.js.
When I run the script below, it outputs names in a wrong order. I believe that it's caused by asynchronous nature of it, how can I make it work in the "right" order? Do I need to use a sync package or is there a way to change it in a way so it'll work in a sync way?
app.get('/returned', function (req, res) {
for (var y = 0; y < 10; y++) {
var url = "http://example.com" + y + "/person.html";
request(url, function (err, resp, body) {
$ = cheerio.load(body);
var links = $('#container');
var name = links.find('span[itemprop="name"]').html(); // name
if (name == null) {
console.log("returned null");
} else {
console.log(name);
}
});
}
});

Promise makes this relatively easy:
app.get('/returned', function (req, res) {
let urls = [];
for (let y = 0; y < 10; y++) {
urls.push('http://example.com' + y + '/person.html');
}
Promise.all(urls.map(function (url) {
return new Promise(resolve, reject) {
request(url, function (err, resp, body) {
if (err) {return reject(err);}
let $ = cheerio.load(body);
let links = $('#container');
let name = links.find('span[itemprop="name"]').html(); // name
resolve({name: name, links: links, url: url});
});
});
}).then(function (result) {
result.forEach(function (obj) {
if (obj.name == null) {
console.log(obj.url, "returned null");
} else {
console.log(obj.url, obj.name);
}
});
}).catch(function (err) {
console.log(err);
});
});
I started by creating an array of urls to get, then I mapped that to an array of promises. When each of the requests are complete, i resolved the promise with the name, url, and links. When all promises were complete, I then looped over the result which will will be in the original order. This runs in parallel.

Nope, you shouldn't have to use a sync package. IMO the cleanest way is to use a mature 3rd party library.
I'd recommend async.
The async.series method would execute all request functions in the order they are given, then allow you to register a callback to fire when all requests have been made, or when an error has occurred.
https://github.com/caolan/async#seriestasks-callback

Async piping file to an HTTP request

I'm trying to send off interleaved GET and POST requests to a server, but the POST request is sending data from a file, which seems to throw off the timing.
var async = require('async');
var http = require('http');
var request = require('request');
var fs = require('fs');
var arr = [];
for (var i = 1; i <= 50; i++) {
arr.push(i);
}
var limitedAgent = new http.Agent({maxSockets: 6});
function processThenSendRequest(data, onfinish) {
request.get({
url: 'http://www.google.com',
pool: limitedAgent
}, (function(j) {
return function(err, res) {
console.log("GET: response from " + j);
};
})(data)).on('socket', (function(j) {
return function(socket) {
console.log("GET: socket assigned for " + j);
}
})(data));
var source = fs.createReadStream('README.md');
var postReq = request.post({
url: 'http://www.google.com',
pool: limitedAgent
}, (function(j) {
return function(err, res) {
console.log("POST: response from " + j);
};
})(data)).on('socket', (function(j) {
return function(socket) {
console.log("POST: socket assigned for " + j);
}
})(data));
// source.pipe(postReq);
setTimeout(function() {
onfinish(null, data);
}, 10000);
}
async.map(arr, processThenSendRequest, function(err, results) {
if (err) console.error(err);
console.log("finished");
});
The code as written above runs fine, with the GET and POST requests being sent out in alternating order, but if I uncomment the source.pipe(postReq) line, then all the GET requests are sent before all the POST requests.
Is there a solution to this issue? I could use async.mapLimit but that feels like a hack and that the solution should be through the request library - this impression may be based on a misunderstanding though.

Per my comment:
Because Node is entirely non-blocking (at least when written this way) you can't be sure anything will occur in order unless you run it in series. async.series can also do this for you, or async.eachSeries.
Further to that, since Node doesn’t wait for asynchronous activities to finish, each task gets queued up immediately, while the callbacks (the event completion event) will occur on a first-come-first-serve basis. In your case, since GET requests take far less time to go around than POST requests, that's why they're completing first.

Trying to make my own RxJs observable

I'm trying to convert an existing API to work with RxJS... fairly new to node, and very new to RxJs, so please bear with me.
I have an existing API (getNextMessage), that either blocks (asynchronously), or returns a new item or error via a node-style (err, val) callback, when the something becomes available.
so it looks something like:
getNextMessage(nodeStyleCompletionCallback);
You could think of getNextMessage like an http request, that completes in the future, when the server responds, but you do need to call getNextMessage again, once a message is received, to keep getting new items from the server.
So, in order to make it into an observable collection, I have to get RxJs to keep calling my getNextMessage function until the subscriber is disposed();
Basically, I'm trying to create my own RxJs observable collection.
The problems are:
I don't know how to make subscriber.dispose() kill the async.forever
I probably shouldn't be using async.forever in the first place
I'm not sure I should be even getting 'completed' for each message - shouldn't that be at the end of a sequence
I'd like to eventually remove the need for using fromNodeCallback, to have a first class RxJS observable
Clearly I'm a little confused.
Would love a bit of help, thanks!
Here is my existing code:
var Rx = require('rx');
var port = require('../lib/port');
var async = require('async');
function observableReceive(portName)
{
var observerCallback;
var listenPort = new port(portName);
var disposed = false;
var asyncReceive = function(asyncCallback)
{
listenPort.getNextMessage(
function(error, json)
{
observerCallback(error, json);
if (!disposed)
setImmediate(asyncCallback);
}
);
}
return function(outerCallback)
{
observerCallback = outerCallback;
async.forever(asyncReceive);
}
}
var receive = Rx.Observable.fromNodeCallback(observableReceive('rxtest'));
var source = receive();
var subscription = source.forEach(
function (json)
{
console.log('receive completed: ' + JSON.stringify(json));
},
function (error) {
console.log("receive failed: " + error.toString());
},
function () {
console.log('Completed');
subscription.dispose();
}
);

So here's probably what I would do.
var Rx = require('Rx');
// This is just for kicks. You have your own getNextMessage to use. ;)
var getNextMessage = (function(){
var i = 1;
return function (callback) {
setTimeout(function () {
if (i > 10) {
callback("lawdy lawd it's ova' ten, ya'll.");
} else {
callback(undefined, i++);
}
}, 5);
};
}());
// This just makes an observable version of getNextMessage.
var nextMessageAsObservable = Rx.Observable.create(function (o) {
getNextMessage(function (err, val) {
if (err) {
o.onError(err);
} else {
o.onNext(val);
o.onCompleted();
}
});
});
// This repeats the call to getNextMessage as many times (11) as you want.
// "take" will cancel the subscription after receiving 11 items.
nextMessageAsObservable
.repeat()
.take(11)
.subscribe(
function (x) { console.log('next', x); },
function (err) { console.log('error', err); },
function () { console.log('done'); }
);

I realize this is over a year old, but I think a better solution for this would be to make use of recursive scheduling instead:
Rx.Observable.forever = function(next, scheduler) {
scheduler = scheduler || Rx.Scheduler.default,
//Internally wrap the the callback into an observable
next = Rx.Observable.fromNodeCallback(next);
return Rx.Observable.create(function(observer) {
var disposable = new Rx.SingleAssignmentDisposable(),
hasState = false;
disposable.setDisposable(scheduler.scheduleRecursiveWithState(null,
function(state, self) {
hasState && observer.onNext(state);
hasState = false;
next().subscribe(function(x){
hasState = true;
self(x);
}, observer.onError.bind(observer));
}));
return disposable;
});
};
The idea here is that you can schedule new items once the previous one has completed. You call next() which invokes the passed in method and when it returns a value, you schedule the next item for invocation.
You can then use it like so:
Rx.Observable.forever(getNextMessage)
.take(11)
.subscribe(function(message) {
console.log(message);
});
See a working example here

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP Get request and I want to parse the response and save it to my database.
If i call crawl(i) independentely i get good results. But i have to call crawl() from 1 to 2000.
I get good results but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module queue function but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What i am crawling
My node functions :
function getOptions(i) {
return {
host: 'magicseaweed.com',
path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
method: 'GET'
}
};
function crawl(i){
var req = http.request(getOptions(i), function(res) {
res.on('data', function (body) {
parseLocation(body);
});
});
req.end();
}
function parseLocation(body){
parser.parseString(body, function(err, result) {
if(result && typeof result.rss != 'undefined') {
var locationTitle = result.rss.channel[0].title;
var locationString = result.rss.channel[0].item[0].link[0];
var location = new Location({
id: locationString.split('/')[2],
name: locationTitle
});
location.save();
}
});
}
N = 2 //# of simultaneous tasks
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
q.drain = function() {
console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] WELL, after a lot of testing it seems that the service I am crawling cannot handle so many request that fast. Because when I do each requests sequentially, I can get all the good responses.
Is there a way to SLOW DOWN ASYNC queue method?

You should have a look at this great module, async which simplifies async tasks like this. You can use queue, simple example:
N = # of simultaneous tasks
var q = async.queue(function (task, callback) {
somehttprequestfunction(task.url, function(){
callback();
}
}, N);
q.drain = function() {
console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It will have a window of ongoing actions and the tasks room will be available for a future task if you only invoke the callback function. Difference is, your code now opens 2000 connections immidiately and obviously the failure rate is high. Limiting it to a reasonable value, 5,10,20 (depends on site and connection) will result in a better sucess rate. If a request fails, you can always try it again, or push the task to another async queue for another trial. The key point is to invoke callback() in queue function, so that a room will be available when it is done.

var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
You'are executing next task immediately after starting the previous one, in this way, the queue is just meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* name this argument as 'next' is more meaningful */) {
crawl(task.url, function () {
// after this one is done, start next one.
next();
});
// or, more simple way, crawl(task.url, next);
}, N);

Another option if you want. Vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
incrementer++
if(incrementer == 100){
clearInterval(myInterval)
//when done parse results array
}
//make request here
//push request result to array here
}, 500);
Invokes the function every half second. Easy way to force sync and exit after x requests.

I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an api endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/")
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;
fs.readFile('test.txt', function(err, data){
var parsedItems = JSON.parse(data);
var urlparts = []
// create a queue
for (let year of range(1975, 2016)) {
for (var make in parsedItems[year]){
console.log(year, make, '/models/' + year + '/' + make)
urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make})
}
}
// start dequeue
waitDequeue();
// This function calls itself after the makeRequest promise completes
function waitDequeue(){
var item = urlparts.pop()
if (item){
makeRequest(item)
.then(function(){
// wait this time before next dequeue
setTimeout(function() {
waitDequeue();
}, 3000);
})
} else {
write(parsedItems)
}
}
// make a request, mutate parsedItems then resolve
function makeRequest(item){
return new Promise((resolve, reject)=>{
request
.get(item.urlpart)
.set(options.auth[0], options.auth[1])
.set(options.type[0], options.type[1])
.end(function(err, res) {
if (err) return done1(err);
console.log(res.body)
res.body.forEach(function(model){
parsedItems[item.year][item.make][model] = {}
});
resolve()
})
})
}
// write the results back to the file
function write(parsedItems){
fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
console.log(err)
})
}
})

A little late but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler eg:
var q = async.priorityQueue(function(task, callback) {
// your code process here for each task
//when ready to complete the task delay it by calling
async.whilst( //wait 6 seconds
function() {
return count < 10;
},
function(callback) {
count++;
setTimeout(function() {
callback(null, count);
}, 1000);
},
function (err, n) {
// n seconds have passed
callback(); //callback to q handler
}
); //whilst
} , 5);

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

limit concurrent operations nodejs - node.js

Becoz, you have condition that states not to request more than 5 with an if statement. if(concurrent_requests<5) { This solution is not scalable as will go over the stack after certain recursive calls. Hope it helps.

Related

Intro to node.js - Print data from 3 urls (http.get)

Node.js + Cheerio : Request inside a loop

Async piping file to an HTTP request

Trying to make my own RxJs observable

NodeJS async queue too fast (Slowing down async queue method)

Categories

Resources