Intro to node.js - Print data from 3 urls (http.get)

Intro to node.js - Print data from 3 urls (http.get) - node.js

I'm doing an introduction to node.js using learnyounode. I wonder if you could help realize this thing: asynchronism.
So, here is the problem:
This problem is the same as the previous problem (HTTP COLLECT) in
that you need to use http.get(). However, this time you will be
provided with three URLs as the first three command-line
arguments.
You must collect the complete content provided to you by each of the URLs and print it to the console (stdout). You don't need to
print out the length, just the data as a String; one line per URL.
The catch is that you must print them out in the same order as the
URLs are provided to you as command-line arguments.
and here is my bad solution who, in fact, don't work.
var http = require('http');
var message = [];
for (var i = 2; i < 5; i++)
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
res.on('data', function(line) {
message[i] += line.toString();
});
res.on('end', function(line) {
for (var i = 0; i < 3; i++)
console.log(message[i]);
});
});
UPDATE
So I tried a similar approach to your solution.
Here goes:
var http = require('http');
var count = 0;
var message = ["","",""];
for (var i = 2; i < 5; i++)
{
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
res.on('data', function( line ) {
message[count] += line.toString();
});
res.on('end', function(line) {
count++;
if(count !== 3)
return;
else
printOutput();
});
});
}
function printOutput(){
for (var i = 0; i < 3; i++)
console.log(message[i]);
}
But the output is lagged: / (not in the right order)
CURRENT: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
EXPECTED: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
CURRENT ". He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter.."
EXPECTED: "He has not got the skite and watch out for the bogged Trent from punchy blue with the dry to the Vinnie's It'll be flanno
where flat out like the slabs..."
CURRENT: "He's got a massive coldie my watch out for the smoko We're jackaroo going on she'll be right servo dramas.."
EXPECTED: "He has not got a banana bender piece of piss the dry as a budgie smugglers Come a flamin clacker you little bog standard
ripper The cross them to his blood's worth bottling flamin the cunning
of a rip snorter..."
CURRENT: ""
EXPECTED ""

a more cleaner way to do it asynchronously is by having all Promises in an array and calling Promise.all() on that array
var http = require('http');
promises = [
promiseLoad(process.argv[2]),
promiseLoad(process.argv[3]),
promiseLoad(process.argv[4])
];
Promise.all(promises).then(function(res){
console.log(res);
});
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function(res) {
res.on('data', function(d) {
body += d;
});
res.on('end', function() {
resolve(body);
});
});
});
}

You have to wait for the prior request to reach the 'end' event before processing the next request, hence the asynchronous challenge. This can be accomplished via callbacks, or promises.
Promise implementation:
var http = require('http');
promiseLoad(process.argv[2])
.then(promiseLoad(process.argv[3])
.then(promiseLoad(process.argv[4]);
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function(res) {
res.on('data', function(d) {
body += d;
});
res.on('end', function() {
console.log(body);
resolve();
});
});
});
}
I will leave the callback implementation to you as an exercise. As a starting point, the next request will have to be fired only once the end event if fired.
UPDATE:
To load these truly asynchronously and at the same time, your code will work with minor modifications. You need to simply wait for end to be called 3 times and only log at that point indicating that all loading is complete:
var http = require('http');
var count = 0;
var message = [];
for (var i = 2; i < 5; i++)
http.get(process.argv[i], function (res) {
res.setEncoding('utf8');
var correctIndex = i;
res.on('data', function(line) {
message[correctIndex] += line.toString();
});
res.on('end', function(line) {
count++;
if(count !== 3) return;
for (var i = 0; i < 3; i++)
console.log(message[i]);
});
});

First, I want to say that the answer already here that says to use Promise.all() is the way that I would suggest. However, I want to point out a particular scenario where it might not cover your needs.
Consider that you have 3 requests:
"Service" | "Time to complete"
----------------------------
A | 3
B | 1
C | 5
D | 4
And you're going to use a load handler similar to what has already been mentioned:
// Url loader
function load(url) {
var message = "";
return new Promise(function (resolve, reject) {
http.get(url, function (res) {
// Add message piece
res.on("data", function (data) {
message += data;
});
// Resolve whole message
res.on("end", function (data) {
resolve(message);
});
});
});
}
Printing After Everything Finishes
If you use the Promise.all(), you are going to have to wait for all of the requests to finish before you see any output. So if we output a timestamp with our data, we will get the following:
Code
/*
Wait for all promises to complete and then
print out all of their collected data
*/
Promise.all(promises).then(function (res) {
res.forEach(function (data) {
timestamp(data);
});
});
Output
[14:9:4.106] Start
[14:9:10.335] aaaa
[14:9:10.336] bbbb
[14:9:10.336] cccc
[14:9:10.336] dddd
Where it takes 6 seconds after we start to see any output from the result of our services.
Printing As Soon As Possible
Comparatively, if we wanted to print output while we are getting results from our service calls, we need to print the result as the service finishes, but not until all "prior" services are done. With that in mind, we could do could do something like this:
Code
promises[0].then(function (dataA) {
timestamp(dataA);
promises[1].then(function (dataB) {
timestamp(dataB);
promises[2].then(function (dataC) {
timestamp(dataC);
promises[3].then(function (dataD) {
timestamp(dataD);
});
});
});
});
Output
[14:16:19.245] Start
[14:16:22.974] aaaa
[14:16:22.975] bbbb
[14:16:25.474] cccc
[14:16:25.474] dddd
Here, we see the start, then only 3 seconds later we print out both Service A and Service B. We see A because its service just resolved and B because its service was already done, but we didn't want to print until A was finished. Similarly, C and D show up about 2 seconds after B.
Now, that code is somewhat verbose, so we could write a recursive function to handle all that nesting for us.
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
var curr = 0;
// This closure is going to recursively print out our promises
function nexter(data) {
if (data) {
timestamp(data);
}
// Have the next promise print its data whenever it is done
promiseArr[curr += 1].then(nexter);
}
// Wait for our first promise to finish and have it kick off the next
promiseArr[curr].then(nexter);
}
I haven't really run into many uses cases where we need to make "synchronous" usage of asynchronous data, but I'm sure that there is a potential need for it somewhere.
Test Code Used:
Change the method variable if you want to use the other methods.
/*global Promise*/
"use strict";
// Provide response times for fake services
function getUrlTiming(url) {
var timing = 0;
switch (url) {
case "a":
timing = 3000;
break;
case "b":
timing = 1000;
break;
case "c":
timing = 5000;
break;
case "d":
timing = 4000;
break;
default:
timing = 0;
break;
}
return timing;
}
// Service to wrap events
function Service() {
this.listeners = [];
}
Service.prototype = {
on: function (event, cb) {
if (!this.listeners[event]) {
this.listeners[event] = [];
}
this.listeners[event].push(cb);
},
emit: function (event, details) {
if (this.listeners[event]) {
this.listeners[event].forEach(function (cb) {
cb(details);
});
}
}
};
// Make a fake http module
var http = {
get: function (url, cb) {
// Make an event emiiter
var req = new Service();
// If we got a callback
if (cb && (typeof cb === "function")) {
// Call it to set up listeners
cb(req);
}
// Make a promise to resolve after the service finishes
return new Promise(function (resolve, reject) {
var network,
message = "",
part = 0,
maxParts = 4;
/*
Create a network simulation to send a massage in parts
until the request finishes
*/
network = setInterval(function () {
// If the message isn't complete
if (part < 4) {
// Add to the whole message tracker
message += url;
// Emit that we got data
req.emit("data", url);
// Increment how far in the message we are
part += 1;
} else {
// Stop transmitting
clearInterval(network);
// Emit the end of the request
req.emit("end", message);
// Resolve the request
resolve(url);
}
}, (getUrlTiming(url) / maxParts));
});
}
};
// Url loader
function load(url) {
var message = "";
return new Promise(function (resolve, reject) {
http.get(url, function (res) {
// Add message piece
res.on("data", function (data) {
message += data;
});
// Resolve whole message
res.on("end", function (data) {
resolve(message);
});
});
});
}
// Get a readable time
function getTime() {
var now = new Date();
return (now.getHours() + ":" + now.getMinutes() + ":" + now.getSeconds() + "." + now.getMilliseconds());
}
// Print a timestamped message
function timestamp(message) {
console.log("[%s] %s", getTime(), message);
}
// Function to print an array of promises in order
function cascadeInOrder(promiseArr) {
var curr = 0;
// This closure is going to recursively print out our promises
function nexter(data) {
if (data) {
timestamp(data);
}
// Have the next promise print its data whenever it is done
promiseArr[curr += 1].then(nexter);
}
// Wait for our first promise to finish and have it kick off the next
promiseArr[curr].then(nexter);
}
/*
No matter what, we want all of our requests to
start right now, and effectively at the same time.
We don't want to start one after another finishes
*/
var promises = [
load("a"),
load("b"),
load("c"),
load("d")
];
/*
Which method we want to use to test our stuff
Change between [1, 2, 3] for each method listed
below. 1 for Promise.all(), 2 for ASAP printing,
and 3 for the verbose version of 2.
*/
var method = 3;
// Note when we started
timestamp("Start");
if (method === 1) {
/*
Wait for all promises to complete and then
print out all of their collected data
*/
Promise.all(promises).then(function (res) {
res.forEach(function (data) {
timestamp(data);
});
});
} else if (method === 2) {
/*
Print each ones data as soon as it is
available; but make sure to do it in order
*/
cascadeInOrder(promises);
} else if (method === 3) {
/*
This is the same as the "cascadeInOrder" function,
except written without recursion and more verbosely.
*/
promises[0].then(function (dataA) {
timestamp(dataA);
promises[1].then(function (dataB) {
timestamp(dataB);
promises[2].then(function (dataC) {
timestamp(dataC);
promises[3].then(function (dataD) {
timestamp(dataD);
});
});
});
});
}

#Luís Melo
Here's my solution after going through this thread:
var http = require('http');
var bl = require('bl')
promises = [
promiseLoad(process.argv[2]),
promiseLoad(process.argv[3]),
promiseLoad(process.argv[4])
];
Promise.all(promises).then(function(res) {
for(i=0; i<promises.length; i++) {
console.log(res[i]);
}
});
function promiseLoad(url) {
var body = '';
return new Promise(function(resolve, reject) {
http.get(url, function (response) {
response.setEncoding('utf8');
response.pipe(bl(function (err, data) {
resolve(data.toString())
}))
})
});
}
Here's the official solution in case you want to compare notes:
var http = require('http')
var bl = require('bl')
var results = []
var count = 0
function printResults () {
for (var i = 0; i < 3; i++) {
console.log(results[i])
}
}
function httpGet (index) {
http.get(process.argv[2 + index], function (response) {
response.pipe(bl(function (err, data) {
if (err) {
return console.error(err)
}
results[index] = data.toString()
count++
if (count === 3) {
printResults()
}
}))
})
}
for (var i = 0; i < 3; i++) {
httpGet(i)
}

Related

How to do a search in rows?

I'm just learning not to judge strictly. I try to do Rest Client. And I want to do a data search by rows.
At the moment I just want to try to find the given data in the array. And if one of the lines is the same, the message OK will be displayed, and in the others No. But I have all messages displays No. How to fix it?
var data = querystring.stringify({
Name: "CEO"
})
var req = http.request(options, function(res) {
console.log("status: " + res.statusCode);
var content = ''
res.on('data', function(chunk) {
content += chunk
for (var i = 0; i < content.length; i++) {
if (content[i] === data) {
console.log('Ок')
} else {
console.log('No')
}
}
});
}).on('error', function(e) {
console.log("error: " + e.message);
});
req.write(data)
req.end()
-
console.log(`body: ${(chunk)}`):
Answer:
body: {"rowsCount":75,"rows":[..., {"Id":75,"Name":"CEO"}]}

There are some programming / logical problem with your code that prevent you from getting the desired result.
One of the most important parts (sometimes the trickiest one) of programming is to know the types of data you are dealing with then handle them properly.
As in the problem you are resolving, the data returned from the API is in JSON format, and can be treated as object in javascript. You don't have to do a string concat to append them to content variable. By appending chunk to the content variable, you turn the whole set of data return to you to a string, doing a for loop on a string have a very different result compared with on an array.
The second issue is if you look into the data structure that returns to you through the api, the list of records you want to search on is under the rows attribute, therefore, you should loop through the chunk.rows only.
A suggesting fix for your code will be as follows:
var searchQuery = 'SEO';
var req = http.request(options, function(res) {
res.on('data', function(chunk) {
for (var i = 1; i <= chunk.rows.length; i++) {
if (chunk.rows[i].Name === searchQuery) {
console.log('Ок')
} else {
console.log('No')
}
}
});
}).on('error', function(e) {
console.log("error: " + e.message);
});
req.write(data)
req.end()

Your for-loop will skip the first element of the data you get back
You aren't accessing the desired search property correctly.
Your query is a string for which you are testing equality against an object.
You need to access the rows property of your incoming data and then perform your query:
Attempt one:
const rowsToSearch = content.rows; // #2
for (var i = 0; i < rowsToSearch.length; i++) { // #1
if (rowsToSearch[i].Name === 'CEO') { // #3
console.log('Ок')
} else {
console.log('No')
}
}
Attempt two, using lodash:
const query = { Name: 'CEO' };
const rowsToSearch = chunk.rows;
for (var i = 0; i < rowsToSearch.length; i++) {
if (_.isEqual(rowsToSearch[i], query)) {
console.log('Ок')
} else {
console.log('No')
}
}
Attempt 3, using ES6 iteration and lodash:
const query = { Name: 'CEO' };
const rowsToSearch = chunk.rows;
const result = rowsToSearch.find(row => _.isEqual(row, query));
result ? console.log('OK') : console.log('No');

delaying requests using request and cheerio modules

So this is the code I used to crawl my pages (i'm using request and cheerio modules:
for (let j = 1; j < nbRequest; j++)
{
const currentPromise = new Promise((resolve, reject) => {
request(
`https://www.url${j}`,
(error, response, body) => {
if (error || !response) {
console.log("Error: " + error);
}
console.log("Status code: " + response.statusCode + ", Connected to the page");
var $ = cheerio.load(body);
let output = {
ranks: [],
names: [],
numbers: [],
};
$('td.rangCell').each(function( index ) {
if ($(this).text().trim() != "Rang")
{
output.ranks.push($(this).text().trim().slice(0, -1));
nbRanks = nb_ranks+1;
}
});
$('td.nameCell:has(label)').each(function( index ) {
output.names.push($(this).find('label.nameValue > a').text().trim());
});
$('td.numberCell').each(function( index ) {
if ($(this).text().trim() != "Nombre")
{
output.numbers.push($(this).text().trim());
}
});
console.log("HERE 1");
return resolve(output);
}
);
});
promises.push(currentPromise);
}
after that I'm parsing and saving the result in a csv file using a node module.
At this point i've been able to crawl about 100 pages, but when it comes to much bigger numbers (1000+) I'm receiving a 500 response meaning that i'm being kicked i think.
So i think the best solution is to delay requests, but i didn't find the solution.
Do you guys have any idea and how the code would look like ?

what you are looking for is called "Control Flow", you can achieve this by using async.queue for example.
If you add every request to the the queue you can control the amount of parallel requests with the amount of workers. And you could add setTimeouts to the final part of the request's callback to achieve the delaying of requests.
Additionally I'd suggest using a "crawler" package (instead of building your own) e.g. npm-crawler as they ship with build in rate-limiting and have already taken care of other things that you might face next :) e.g. user-agent pool
Update:
const async = require("async");
const delayTime = 1500; //wait 1,5 seconds after every new request
getRequestPromise(csvLine){
return new Promise( make you request here );
}
const asyncQueue = async.queue(function(task, callback) {
getRequestPromise(task).then(_ => {
setTimeout(() => {
callback(null);
}, delayTime);
});
}, 1); //1 one request at a time
for(csv){ //pseudo
asyncQueue.push(csv[i], () => {});
}
asyncQueue.drain = () => {
console.log("finished.");
};

How to handle callbacks in a for loop(Node.JS)

I am trying to write a code with NodeJS where I grab data from an external API and then populate them in MongoDB using Mongoose. In between that, I'll check to see if that particular already exists in Mongo or not. Below is my code.
router.route('/report') // the REST api address
.post(function(req, res) // calling a POST
{
console.log('calling report API');
var object = "report/" + reportID; // related to the API
var parameters = '&limit=100' // related to the API
var url = link + object + apiKey + parameters; // related to the API
var data = "";
https.get(url, function callback(response)
{
response.setEncoding("utf8");
response.on("data", function(chunk)
{
data += chunk.toString() + "";
});
response.on("end", function()
{
var jsonData = JSON.parse(data);
var array = jsonData['results']; // data is return in array of objects. accessing only a particular array
var length = array.length;
console.log(length);
for (var i = 0; i < length; i++)
{
var report = new Report(array.pop()); // Report is the schema model defined.
console.log('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^');
console.log(i);
console.log('*****************************');
console.log(report);
console.log('*****************************');
// console.log(report['id']);
/*report.save(function(err)
{
if(err)
res.send(err);
});*/
Report.find({id:report['id']}).count(function(err, count) // checks if the id of that specific data already exists in Mongo
{
console.log(count);
console.log('*****************************');
if (count == 0) // if the count = 0, meaning not exist, then only save
{
report.save(function(err)
{
console.log('saved');
if(err)
res.send(err);
});
}
});
};
res.json({
message: 'Grabbed Report'
});
});
response.on("error", console.error);
});
})
My problem is that since NodeJS callbacks are parallel, it is not getting called sequentially. My end result would be something like this :
Calling report API
console.log(length) = 100
^^^^^^^^^^^^^^^^^^^^^^^^
console.log(i) = starts with 0
*******************************
console.log(report) = the data which will be stored inside Mongo
*******************************
number 3 - 7 repeats 100 times as the length is equals to 100
console.log(count) = either 0 or 1
number 9 repeats 100 times
console.log('saved')
number 11 repeats 100 times
Lastly, only the last out of 100 data is stored into Mongo
What I need is some sort of technique or method to handle these callbacks which are executing one after the other and not sequentially following the loop. I am pretty sure this is the problem as my other REST APIs are all working.
I have looked into async methods, promises, recursive functions and a couple others non which I could really understand how to solve this problem. I really hope someone can shed some light into this matter.
Feel free also to correct me if I did any mistakes in the way I'm asking the question. This is my first question posted in StackOverflow.

This problem is termed as the "callback hell".
There's lots of other approaches like using Promise and Async libraries you'll find.
I'm more excited about the native async ES7 will bring,
which you can actually start using today with transpiler library Babel.
But by far the simplest approach I've found is the following:
You take out the long callback functions and define them outside.
router.route('/report') // the REST api address
.post(calling_a_POST)
function calling_a_POST(req, res) {
...
var data = "";
https.get(url, function callback(response) {
...
response.on("end", response_on_end_callback); // --> take out
response.on("error", console.error);
});
}
function response_on_end_callback() { // <-- define here
...
for (var i = 0; i < length; i++) {
var report = new Report(array.pop());
...
Report.find({ id: report['id'] })
.count(Report_find_count_callback); // --> take out
};
res.json({
message: 'Grabbed Report'
});
}
function Report_find_count_callback(err, count) { // <-- define here
...
if (count == 0) {
report.save(function(err) { // !! report is undefined here
console.log('saved');
if (err)
res.send(err); // !! res is undefined here
});
}
}
A caveat is that you won't be able to access all the variables inside what used to be the callback,
because you've taken them out of the scope.
This could be solved with a "dependency injection" wrapper of sorts to pass the required variables.
router.route('/report') // the REST api address
.post(calling_a_POST)
function calling_a_POST(req, res) {
...
var data = "";
https.get(url, function callback(response) {
...
response.on("end", function(err, data){ // take these arguments
response_on_end(err, data, res); // plus the needed variables
});
response.on("error", console.error);
});
}
function response_on_end(err, data, res) { // and pass them to function defined outside
...
for (var i = 0; i < length; i++) {
var report = new Report(array.pop());
...
Report.find({ id: report['id'] })
.count(function(err, count){
Report_find_count(err, count, report, res); // same here
});
};
res.json({ // res is now available
message: 'Grabbed Report'
});
}
function Report_find_count(err, count, report, res) { // same here
...
if (count == 0) {
report.save(function(err) { // report is now available
console.log('saved');
if (err)
res.send(err); // res is now available
});
}
}
When I execute the response_on_end function, I am getting the undefined:1 unexpected token u error.
I am pretty much sure it has something to do with this line: var jsonData = JSON.parse(data)
My response_on_end is as below: var jsonData = JSON.parse(data); // problem here
I realize I made an error here:
function calling_a_POST(req, res) {
...
var data = "";
https.get(url, function callback(response) {
...
//sponse.on("end", function(err, data){
response.on("end", function(err){ // data shouldn't be here
response_on_end(err, data, res);
});
response.on("error", console.error);
});
}
Another problem I could forsee, which actually may not arise here but still would be better to talk about anyways.
The data variable, since it's a string which is a primitive type unlike an object, it is "passed by value".
More info
It's better to wrap the variable in an object and pass the object, because objects in javascript are always "passed by reference".
function calling_a_POST(req, res) {
...
// var data = ""; //
var data_wrapper = {};
data_wrapper.data = {}; // wrap it in an object
https.get(url, function callback(response) {
...
response.on("data", function(chunk){
data_wrapper.data += chunk.toString() + ""; // use the dot notation to reference
});
response.on("end", function(err){
response_on_end(err, data_wrapper, res); // and pass that object
});
response.on("error", console.error);
});
}
function response_on_end_callback(err, data_wrapper, res) {
var data = data_wrapper.data; // later redefine the variable
...
for (var i = 0; i < length; i++) {
var report = new Report(array.pop());
...

You can use async library for controlling your execution flows. And there are also iterators for working with arrays.

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP Get request and I want to parse the response and save it to my database.
If i call crawl(i) independentely i get good results. But i have to call crawl() from 1 to 2000.
I get good results but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module queue function but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What i am crawling
My node functions :
function getOptions(i) {
return {
host: 'magicseaweed.com',
path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
method: 'GET'
}
};
function crawl(i){
var req = http.request(getOptions(i), function(res) {
res.on('data', function (body) {
parseLocation(body);
});
});
req.end();
}
function parseLocation(body){
parser.parseString(body, function(err, result) {
if(result && typeof result.rss != 'undefined') {
var locationTitle = result.rss.channel[0].title;
var locationString = result.rss.channel[0].item[0].link[0];
var location = new Location({
id: locationString.split('/')[2],
name: locationTitle
});
location.save();
}
});
}
N = 2 //# of simultaneous tasks
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
q.drain = function() {
console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] WELL, after a lot of testing it seems that the service I am crawling cannot handle so many request that fast. Because when I do each requests sequentially, I can get all the good responses.
Is there a way to SLOW DOWN ASYNC queue method?

You should have a look at this great module, async which simplifies async tasks like this. You can use queue, simple example:
N = # of simultaneous tasks
var q = async.queue(function (task, callback) {
somehttprequestfunction(task.url, function(){
callback();
}
}, N);
q.drain = function() {
console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It will have a window of ongoing actions and the tasks room will be available for a future task if you only invoke the callback function. Difference is, your code now opens 2000 connections immidiately and obviously the failure rate is high. Limiting it to a reasonable value, 5,10,20 (depends on site and connection) will result in a better sucess rate. If a request fails, you can always try it again, or push the task to another async queue for another trial. The key point is to invoke callback() in queue function, so that a room will be available when it is done.

var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
You'are executing next task immediately after starting the previous one, in this way, the queue is just meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* name this argument as 'next' is more meaningful */) {
crawl(task.url, function () {
// after this one is done, start next one.
next();
});
// or, more simple way, crawl(task.url, next);
}, N);

Another option if you want. Vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
incrementer++
if(incrementer == 100){
clearInterval(myInterval)
//when done parse results array
}
//make request here
//push request result to array here
}, 500);
Invokes the function every half second. Easy way to force sync and exit after x requests.

I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an api endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/")
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;
fs.readFile('test.txt', function(err, data){
var parsedItems = JSON.parse(data);
var urlparts = []
// create a queue
for (let year of range(1975, 2016)) {
for (var make in parsedItems[year]){
console.log(year, make, '/models/' + year + '/' + make)
urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make})
}
}
// start dequeue
waitDequeue();
// This function calls itself after the makeRequest promise completes
function waitDequeue(){
var item = urlparts.pop()
if (item){
makeRequest(item)
.then(function(){
// wait this time before next dequeue
setTimeout(function() {
waitDequeue();
}, 3000);
})
} else {
write(parsedItems)
}
}
// make a request, mutate parsedItems then resolve
function makeRequest(item){
return new Promise((resolve, reject)=>{
request
.get(item.urlpart)
.set(options.auth[0], options.auth[1])
.set(options.type[0], options.type[1])
.end(function(err, res) {
if (err) return done1(err);
console.log(res.body)
res.body.forEach(function(model){
parsedItems[item.year][item.make][model] = {}
});
resolve()
})
})
}
// write the results back to the file
function write(parsedItems){
fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
console.log(err)
})
}
})

A little late but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler eg:
var q = async.priorityQueue(function(task, callback) {
// your code process here for each task
//when ready to complete the task delay it by calling
async.whilst( //wait 6 seconds
function() {
return count < 10;
},
function(callback) {
count++;
setTimeout(function() {
callback(null, count);
}, 1000);
},
function (err, n) {
// n seconds have passed
callback(); //callback to q handler
}
); //whilst
} , 5);

Limiting asynchronous calls in Node.js

I've got a Node.js app that gets a list of file locally and uploads them to a server. This list could contain thousands of files.
for (var i = 0; i < files.length; i++) {
upload_file(files[i]);
}
If I execute this with thousands of files, upload_file will get called thousands of times all at once, and most likely die (or at least struggle). In the synchronous world, we'd create a thread pool and limit it to a certain number of threads. Is there a simple way to limit how many asynchronous calls get executed at once?

As usual, I recommend Caolan McMahon's async module.
Make your upload_file function take a callback as it's second parameter:
var async = require("async");
function upload_file(file, callback) {
// Do funky stuff with file
callback();
}
var queue = async.queue(upload_file, 10); // Run ten simultaneous uploads
queue.drain = function() {
console.log("All files are uploaded");
};
// Queue your files for upload
queue.push(files);
queue.concurrency = 20; // Increase to twenty simultaneous uploads

The answer above, re: async on NPM is the best answer, but if you'd like to learn more about control flow:
You should look into control flow patterns. There's a wonderful discussion on control flow patterns in Chapter 7 of Mixu's Node Book. Namely, I'd look at the example in 7.2.3: Limited parallel - an asynchronous, parallel, concurrency limited for loop.
I've adapted his example:
function doUpload() {
// perform file read & upload here...
}
var files = [...];
var limit = 10; // concurrent read / upload limit
var running = 0; // number of running async file operations
function uploader() {
while(running < limit && files.length > 0) {
var file = files.shift();
doUpload(file, function() {
running--;
if(files.length > 0)
uploader();
});
running++;
}
}
uploader();

You should try queueing. I assume that a callback is fired when upload_file() finishes. Something like this should do the trick (untested):
function upload_files(files, maxSimultaneousUploads, callback) {
var runningUploads = 0,
startedUploads = 0,
finishedUploads = 0;
function next() {
runningUploads--;
finishedUploads++;
if (finishedUploads == files.length) {
callback();
} else {
// Make sure that we are running at the maximum capacity.
queue();
}
}
function queue() {
// Run as many uploads as possible while not exceeding the given limit.
while (startedUploads < files.length && runningUploads < maxSimultaneousUploads) {
runningUploads++;
upload_file(files[startedUploads++], next);
}
}
// Start the upload!
queue();
}

The others answers seem to be outdated. This can be solved easily using paralleLimit from async. Below is how to use it. I haven't tested it.
var tasks = files.map(function(f) {
return function(callback) {
upload_file(f, callback)
}
});
parallelLimit(tasks, 10, function(){
});

No external libraries. Just plain JS.
It can be resolved using recursion.
The idea is that initially we immediately start the maximum allowed number of uploads and each of these requests should recursively initiate a new upload on its completion.
In this example I populate successful responses together with errors and I execute all requests but it's possible to slightly modify algorithm if you want to terminate batch upload on the first failure.
async function batchUpload(files, limit) {
limit = Math.min(files.length, limit);
return new Promise((resolve, reject) => {
const responsesOrErrors = new Array(files.length);
let startedCount = 0;
let finishedCount = 0;
let hasErrors = false;
function recursiveUpload() {
let index = startedCount++;
uploadFile(files[index])
.then(res => {
responsesOrErrors[index] = res;
})
.catch(error => {
responsesOrErrors[index] = error;
hasErrors = true;
})
.finally(() => {
finishedCount++;
if (finishedCount === files.length) {
hasErrors ? reject(responsesOrErrors) : resolve(responsesOrErrors);
} else if (startedCount < files.length) {
recursiveUpload();
}
});
}
for (let i = 0; i < limit; i++) {
recursiveUpload();
}
});
}
async function uploadFile(file) {
console.log(`${file} started`);
const delay = Math.floor(Math.random() * 1500);
return new Promise((resolve, reject) => {
setTimeout(() => {
if (delay <= 1000) {
console.log(`${file} finished successfully`);
resolve(`${file} success`);
} else {
console.log(`${file} finished with error`);
reject(`${file} error`);
}
}, delay);
});
}
const files = new Array(10).fill('file').map((file, index) => `${file}_${index + 1}`);
batchUpload(files, 3)
.then(responses => console.log('All successfull', responses))
.catch(responsesWithErrors => console.log('All with several failed', responsesWithErrors));

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Intro to node.js - Print data from 3 urls (http.get) - node.js

Related

How to do a search in rows?

delaying requests using request and cheerio modules

How to handle callbacks in a for loop(Node.JS)

NodeJS async queue too fast (Slowing down async queue method)

Limiting asynchronous calls in Node.js

Categories

Resources