Node.js: non-blocking code

I'm just starting off with Node.js and struggling with some of the finer points of non-blocking (asynchronous?) code. I know there are lots of questions about blocking vs non-blocking code already, but after reading through some of them, I still couldn't sort out this issue.
As a learning exercise, I made a simple script that loads URLs from a file, queries them using the request module, and notifies me if a URL is the New York Times homepage.
Here is an MWE:
// CSV Parse test
'use strict';
var request = require('request');
var fs = require('fs');
var parse = require('csv-parse');
var text = fs.readFileSync('input.txt', 'utf8');
var r_isnyt = /New York Times/;
var data = [];
parse(text, {skip_empty_lines: true}, function(err, data) {
    for (var r = 0; r < data.length; r++) {
        console.log('Logging from within parse function:');
        console.log('URL: ' + data[r][0] + '\n');
        var url = data[r][0];
        request(url, function(error, response, body) {
            console.log('Logging from within request function:');
            console.log('Loading URL: ' + url + '\n');
            if (!error && response.statusCode == 200) {
                if (r_isnyt.exec(body)) {
                    console.log('This is the NYT site! ');
                }
                console.log('');
            }
        });
    }
});
And here is my input.txt:
http://www.nytimes.com/
www.google.com
From what I understood of non-blocking code, this program's flow would be:
parse(text, {skip_empty_lines: true}, function(err, data){ loads the data and returns the lines of the input file in a 2D array, which is complete and available right after that line.
The for loop iterates through it, loading URLs with the line request(url, function(error, response, body) {, which is non-blocking (right?), so the for loop continues without waiting for the previous URL to finish loading.
As a result, you could have multiple URLs being loaded at once, and the console.log calls within request will print in the order the responses are received, not the order of the input file.
Within request, which has access to the results of the request to url, we print the URL, check whether it's the New York Times, and print the result of that check (all blocking steps, I thought).
That's a long-winded way of getting around to my question. I just wanted to clarify that I thought I understood the basic concepts of non-blocking code. So what's baffling me is that my output is as follows:
>node parsecsv.js
Logging from within parse function:
URL: http://www.nytimes.com/
Logging from within parse function:
URL: www.google.com
Logging from within request function:
Loading URL: www.google.com
Logging from within request function:
Loading URL: www.google.com
This is the NYT site!
>
I understand why the request printouts all happen together at the end, but why do they both print Google, and much more baffling, why does the last one say it's the NYT site, when the log line right before it (from within the same request call) has just printed Google? It's like the request calls are getting the correct URLs, but the console.log calls are lagging, and just print everything at the end with the ending values.
Interestingly, if I reverse the order of the URLs, everything looks correct in the output, I guess because of differences in response times from the sites:
>node parsecsv.js
Logging from within parse function:
URL: www.google.com
Logging from within request function:
Loading URL: www.google.com
Logging from within parse function:
URL: http://www.nytimes.com/
Logging from within request function:
Loading URL: http://www.nytimes.com/
This is the NYT site!
>
Thanks in advance.
Update
Based on the answer from jfriend00 below, I've changed my code to use a .forEach loop instead as follows. This appears to fix the issue.
// CSV Parse test
'use strict';
var request = require('request');
var fs = require('fs');
var parse = require('csv-parse');
var text = fs.readFileSync('input.txt', 'utf8');
var r_isnyt = /New York Times/;
var data = [];
parse(text, {skip_empty_lines: true}, function(err, data) {
    data.forEach(function(row) {
        console.log('Logging from within parse function:');
        console.log('URL: ' + row[0] + '\n');
        let url = row[0];
        request(url, function(error, response, body) {
            console.log('Logging from within request function:');
            console.log('Loading URL: ' + url + '\n');
            if (!error && response.statusCode == 200) {
                if (r_isnyt.exec(body)) {
                    console.log('This is the NYT site! ');
                }
                console.log('');
            }
        });
    });
});

I understand why the request printouts all happen together at the end, but why do they both print Google, and much more baffling, why does the last one say it's the NYT site, when the log line right before it (from within the same request call) has just printed Google? It's like the request calls are getting the correct URLs, but the console.log calls are lagging, and just print everything at the end with the ending values.
You correctly understand that the for loop initiates all the request() calls and then they finish sometime later in whatever order the responses come back in.
But, your logging statement:
console.log('Loading URL: '+url+'\n');
refers to a variable in your for loop that is shared by all the iterations of the loop. Since the for loop runs to completion and THEN, sometime later, all the responses arrive and get processed, the loop will have finished by the time any of the responses is processed, and thus the variable url will hold whatever value it had when the loop finished: the value from the last iteration.
In ES6, you can declare the variable with let instead of var and it will be block-scoped, so there will be a unique url variable for each iteration of the loop.
So, change:
var url = data[r][0];
to
let url = data[r][0];
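If it helps to see the scoping difference in isolation, here's a minimal sketch using setTimeout as a stand-in for the async request() callback (not part of your program):

// With var there is one shared variable; every callback sees its final value.
for (var i = 0; i < 3; i++) {
    setTimeout(function() { console.log('var:', i); }, 0); // logs 3, 3, 3
}

// With let each iteration gets a fresh binding; each callback sees its own value.
for (let j = 0; j < 3; j++) {
    setTimeout(function() { console.log('let:', j); }, 0); // logs 0, 1, 2
}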
Prior to ES6, a common way to avoid this issue was to iterate with .forEach(): it takes a callback function, so all your loop code runs in its own scope by nature of how .forEach() works, and each iteration gets its own local variables rather than shared ones.
FYI, though let solves this issue and is one of the things it was designed for, I think your code would probably be a bit cleaner if you just used .forEach() for your iteration since it would replace multiple references to data[r] with a single reference to the current array iteration value.
parse(text, {skip_empty_lines: true}, function(err, data) {
    data.forEach(function(row) {
        console.log('Logging from within parse function:');
        console.log('URL: ' + row[0] + '\n');
        let url = row[0];
        request(url, function(error, response, body) {
            console.log('Logging from within request function:');
            console.log('Loading URL: ' + url + '\n');
            if (!error && response.statusCode == 200) {
                if (r_isnyt.exec(body)) {
                    console.log('This is the NYT site! ');
                }
                console.log('');
            }
        });
    });
});

Your code is fine and you're correct about how it works (including that differences in response times are what's making everything seem good when you switch the order around), but your logging has fallen victim to an unexpected closure: url is declared and updated in the scope of the parse() callback, and in the case where www.google.com is logged both times, it is being updated to its final value by the loop before your request() callbacks start executing.

Related

Getting a JSON from a website Node JS

So I'm fairly new to Node.js, and am having trouble wrapping my head around asynchronous programming. I'm trying to get a JSON from a website and pass it to a variable for use later. To test, I have been using this code:
var https = require("https");
var a;

function getter(url) {
    var request = https.get(url, function(response) {
        var body = "";
        response.on("data", function(chunk) {
            body += chunk;
        });
        response.on("end", function() {
            if (response.statusCode === 200) {
                try {
                    a = JSON.parse(body);
                } catch (err) {
                    console.log(err);
                }
            }
        });
    });
}

getter('https://api.nasa.gov/planetary/apod?api_key=DEMO_KEY');
console.log(a);
When I run this I get a as undefined, which seems to make sense from what I've read. But I'm unclear as to what to do from here. How would I go about passing this JSON into a variable?
http.get is asynchronous and executes its event handlers when the events occur. When you call getter(), the function returns immediately, i.e. it does not wait for the events, and the next statement, console.log(a), is executed.
Furthermore, JavaScript is single-threaded, and the current execution stack is never interrupted for any event/callback. So the event handlers can only run once the current execution has come to an end, i.e. contains no more statements. Thus, your console.log() will always be executed before any event handler of the request, so a is still undefined.
If you want to continue after the request has finished, you have to do it from the event handler.
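For example, a callback-taking version of getter() might look like the sketch below; the (err, result) callback convention here is the usual Node.js style, not something the question's code prescribes:

var https = require("https");

// Accept a callback and invoke it once the whole response has arrived.
function getter(url, callback) {
    https.get(url, function(response) {
        var body = "";
        response.on("data", function(chunk) {
            body += chunk;
        });
        response.on("end", function() {
            if (response.statusCode === 200) {
                try {
                    callback(null, JSON.parse(body));
                } catch (err) {
                    callback(err);
                }
            } else {
                callback(new Error("HTTP " + response.statusCode));
            }
        });
    }).on("error", callback);
}

getter('https://api.nasa.gov/planetary/apod?api_key=DEMO_KEY', function(err, a) {
    if (err) return console.log(err);
    console.log(a); // the parsed JSON is available here, not before
});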
See this excellent presentation for some more details https://youtu.be/8aGhZQkoFbQ

Node.js request function return data

I have gone through questions about returning data from a Node.js request call. A common mistake is assuming statements are executed line by line or synchronously, which is not the case here. A question of this type: How to get data out of a Node.js http get request.
My question is a bit different. I have written a function getNumResults() which returns the number of results for a given query. I am wondering how I can return a value retrieved by the callback function. So for instance:
function getNumResults(url) {
    Request(url, function(body) {
        var responseJSON = JSON.parse(body);
        return responseJSON.results.count;
    });
}

function Request(URL, callback) {
    request(URL, function(error, response, body) {
        console.log('URL: ', URL);
        console.log('error: ', error);
        console.log('statusCode: ', response && response.statusCode);
        console.log('body: ', body);
        callback(body);
    });
}
What if I want getNumResults() to return responseJSON.results.count? How could I do this?
What if I want getNumResults() to return responseJSON.results.count? How could I do this?
You can't directly return an async value from getNumResults(). You just can't. The function returns long before the value is even available. It's a matter of timing. That's how async responses work. They finish some indeterminate time in the future, but they are non-blocking so the rest of your Javascript continues to run and thus the function returns before the result is even available.
The ONLY way to get the result out is with a callback of some kind. That applies to both your Request() function and to your getNumResults() function. Once a result is asynchronous, nobody in the calling chain can escape that. Async is infectious and you can never go from async back to synchronous. So, if your getNumResults() wants to get the value back to its caller, it will either have to accept a callback itself and call that callback when it gets the value, or it will have to return a promise that is resolved with the value.
Here's how you could do this using promises (which are the future of async development in Javascript):
// load a version of the request library that returns a promise instead of
// taking plain callbacks
const rp = require('request-promise');

function getNumResults(url) {
    // returns a promise
    return rp(url).then(body => {
        // make the count be the resolved value of the promise
        let responseJSON = JSON.parse(body);
        return responseJSON.results.count;
    });
}
Then, you would use getNumResults() like this:
getNumResults(someURL).then(count => {
    // use the count result in here
    console.log(`Got count = ${count}`);
}).catch(err => {
    console.log('Got error from getNumResults ', err);
});
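If you'd rather stick with plain callbacks for now, getNumResults() can instead accept a callback of its own. A sketch reusing the Request() helper from the question (someURL is a placeholder):

// Callback-taking variant: pass the count onward instead of returning it.
function getNumResults(url, callback) {
    Request(url, function(body) {
        var responseJSON = JSON.parse(body);
        callback(responseJSON.results.count);
    });
}

getNumResults(someURL, function(count) {
    console.log('Got count = ' + count);
});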
FYI, I think you can also get the request() library to parse your JSON response for you automatically if you want by setting an appropriate option {json: true}.
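Something like this, as a sketch (someURL is a placeholder; with json: true the body argument arrives already parsed):

request({ url: someURL, json: true }, function(error, response, body) {
    if (!error && response.statusCode === 200) {
        console.log(body.results.count); // body is already an object
    }
});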
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.
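As a sketch of what that looks like with got() (this uses the .json() helper from got v11-era versions; check the docs for the version you install):

const got = require('got');

// got's .json() helper fetches and parses the JSON body in one step.
async function getNumResults(url) {
    const body = await got(url).json();
    return body.results.count;
}

getNumResults(someURL)
    .then(count => console.log(`Got count = ${count}`))
    .catch(err => console.log('Got error ', err));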

Concatenating API responses in the correct sequence asynchronously

So I'm building a simple wrapper around an API to fetch all results of a particular entity. The API method can only return up to 500 results at a time, but it's possible to retrieve all results using the skip parameter, which can be used to specify what index to start retrieving results from. The API also has a method which returns the number of results there are that exist in total.
I've spent some time battling with the request package, trying to come up with a way to concatenate all the results in order and then execute a callback that passes all the results through.
This is my code currently:
Donedone.prototype.getAllActiveIssues = function(callback) {
    var url = this.url;
    request(url + `/issues/all_active.json?take=500`, function(error, response, body) {
        if (!error && response.statusCode == 200) {
            var data = JSON.parse(body);
            var totalIssues = data.total_issues;
            var issues = [];
            for (let i = 0; i < totalIssues; i += 500) {
                request(url + `/issues/all_active.json?skip=${i}&take=500`, function(error, response, body) {
                    if (!error && response.statusCode == 200) {
                        console.log(JSON.parse(body).issues.length);
                        issues.concat(JSON.parse(body).issues);
                        console.log(issues); // returns [] on all occasions
                        //callback(issues);
                    } else {
                        console.log("AGHR");
                    }
                });
            }
        } else {
            console.log("ERROR IN GET ALL ACTIVE ISSUES");
        }
    });
};
So I'm starting off with an empty array, issues. I iterate through a for loop, each time increasing i by 500 and passing that as the skip param. As you can see, I'm logging how many issues each response contains before concatenating them with the main issues variable.
The output, from a total of 869 results, is this:
369
[]
500
[]
Why is my issues variable empty when I log it out? There are clearly results to concatenate with it.
A more general question: is this approach the best way to go about what I'm trying to achieve? I figured that even if my code did work, the nature of asynchronicity means it's entirely possible for the results to be concatenated in the wrong order.
Should I just use a synchronous request library?
Why is my issues variable empty when I log it out? There are clearly results to concatenate with it.
A main problem here is that .concat() returns a new array. It doesn't add items onto the existing array.
You can change this:
issues.concat(JSON.parse(body).issues);
to this:
issues = issues.concat(JSON.parse(body).issues);
to make sure you are retaining the new concatenated array. This is a very common mistake.
You also potentially have sequencing issues in your array because you are running a for loop which is starting a whole bunch of requests at the same time and results may or may not arrive back in the proper order. You will still get the proper total number of issues, but they may not be in the order requested. I don't know if that is a problem for you or not. If that is a problem, we can also suggest a fix for that.
A more general question: is this approach the best way to go about what I'm trying to achieve? I figured that even if my code did work, the nature of asynchronicity means it's entirely possible for the results to be concatenated in the wrong order.
Except for the ordering issue, which can also be fixed, this is a reasonable way to do things. We would have to know more about your API to know if this is the most efficient way to use the API to get your results. Usually, you want to avoid making N repeated API calls to the same server and you'd rather make one API call to get all the results.
Should I just use a synchronous request library?
Absolutely not. node.js requires learning how to do asynchronous programming. It is a learning step for most people, but is how you get the best performance from node.js and should be learned and used.
Here's a way to collect all the results in reliable order using promises for synchronization and error propagation (which is hugely useful for async processing in node.js):
// promisify the request() function so it returns a promise
// whose fulfilled value is the request result
function requestP(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(err, response, body) {
            if (err || response.statusCode !== 200) {
                reject({err: err, response: response});
            } else {
                resolve({response: response, body: body});
            }
        });
    });
}
Donedone.prototype.getAllActiveIssues = function() {
    var url = this.url;
    return requestP(url + `/issues/all_active.json?take=500`).then(function(results) {
        var data = JSON.parse(results.body);
        var totalIssues = data.total_issues;
        var promises = [];
        for (let i = 0; i < totalIssues; i += 500) {
            promises.push(requestP(url + `/issues/all_active.json?skip=${i}&take=500`).then(function(results) {
                return JSON.parse(results.body).issues;
            }));
        }
        return Promise.all(promises).then(function(results) {
            // results is an array of each chunk (which is itself an array) so we have an array of arrays
            // now concat all results in order
            return Array.prototype.concat.apply([], results);
        });
    });
};

xxx.getAllActiveIssues().then(function(issues) {
    // process issues here
}, function(err) {
    // process error here
});

Returned method call is undefined?

OK, so I am using a method to make a request and pull some tables from another URL:
Meteor.methods({
    gimmetitle: function () {
        var url = 'http://wiki.warthunder.com/index.php?title=B-17G_Flying_Fortress';
        request(url, function(err, response, body) {
            $ = cheerio.load(body);
            var text = $('.flight-parameters td').text();
            console.log(text);
            return text;
        });
    }
});
When called, the td's in the table successfully print to the server console: http://prntscr.com/721pjh
But when that text is returned from that method to this client code, undefined is printed to the console:
Template.title.events({
    'click #thebutton': function () {
        Meteor.call('gimmetitle', function(error, result) {
            Session.set('gogle', result);
        });
        var avar = Session.get('gogle');
        console.log(avar);
    }
});
Ideas?
You need to understand two different things here:
On the client side, making calls to the server is always asynchronous, because we have to deal with network latency. That's why we use callbacks to fetch the results of Meteor methods: the callback is executed some time in the future, not right away.
This is why Session.set('gogle', result); is actually executed AFTER var avar = Session.get('gogle');, even though it appears before it in your event handler code.
Unlike template helpers, event handlers are NOT reactive. That means that when you set the Session variable to the result of the method, the event handler code is not automatically re-executed with the new value of Session.get('gogle').
You'll need to either do something with the result right in the Meteor method callback, or use a reactive computation (a template helper or Tracker.autorun) that depends on Session.get('gogle'), so it reruns whenever that reactive data source is modified and can use the new value fetched from the server.
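A sketch of the reactive-helper approach (the helper name gogle and its use as {{gogle}} in the template are illustrative):

// Event handler: just kick off the method call and store the result.
Template.title.events({
    'click #thebutton': function () {
        Meteor.call('gimmetitle', function(error, result) {
            Session.set('gogle', result);
        });
    }
});

// Reactive helper: re-runs automatically whenever Session.get('gogle') changes,
// so {{gogle}} in the template always shows the latest value.
Template.title.helpers({
    gogle: function () {
        return Session.get('gogle');
    }
});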
Quick update: I was able to fix this with just one line of code.
Instead of request(url, function(err, response, body), I used the froatsnook:request package with var result = request.getSync(url, {encoding: null}); and then just replaced $ = cheerio.load(body); with $ = cheerio.load(result.body);.
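Pieced together from that description, the fixed method presumably looks something like this (a sketch; I haven't run it against froatsnook:request myself):

Meteor.methods({
    gimmetitle: function () {
        var url = 'http://wiki.warthunder.com/index.php?title=B-17G_Flying_Fortress';
        // Synchronous request: the method blocks until the response arrives,
        // so it can return the parsed text directly.
        var result = request.getSync(url, {encoding: null});
        $ = cheerio.load(result.body);
        return $('.flight-parameters td').text();
    }
});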

Node.js keeps a copy of a global variable inside local scope

I have a Node.js application that uses the request library to GET a number of URLs. The code is something like this:
for (var i = 0; i < docs.length; i++) {
    var URL = docs[i].url;
    request(URL, function(error, response, html) {
        console.log(URL);
        // Other code...
    });
}
For the sake of simplicity, let's say docs contains URLs like [URL1, URL2, URL3, ...]. On the first iteration, URL = URL1 and a request is sent to that URL. On the second iteration, a request is sent to URL2, and so on. However, at the end of the loop, URL = URLn. Inside the request callback, when I log URL, I always get URLn. However, I need to be able to get the respective URLs, that is [URL1, URL2, URL3, ...].
Any idea how I can maintain a local copy of the URL that remains unchanged even when the global URL gets changed?
This must be something easy, but I can't figure it out.
Basically, what you are experiencing is normal behavior in JavaScript, not anything special to Node.js.
The only construct in (pre-ES6) JavaScript that defines a scope is the function, and functions can access their own scope as well as any outer scopes.
Hence, the solution is to wrap your code in a function that takes the global variable as a parameter: the parameter is evaluated for each call of the function, so your inner code gets its own "copy".
Basically you have two options. Either use an immediately invoked function expression (IIFE), which is nothing but a nameless (i.e. anonymous) function that is called immediately where it is defined:
for (var i = 0; i < docs.length; i++) {
    (function (url) {
        request(url, function(error, response, html) {
            console.log(url);
        });
    })(docs[i].url);
}
Or use the built-in forEach function of arrays, which takes a callback function, so each iteration's body automatically runs in its own scope (which results in the same effect):
docs.forEach(function (doc) {
    request(doc.url, function(error, response, html) {
        console.log(doc.url);
    });
});
You should read about closures in JavaScript here.
Meanwhile, in simpler terms: the value of i reaches n at the end of all iterations, hence you rightly get URLn every time. If you wrap the request inside an immediately invoked function expression, you create another level in the scope chain. By doing so, the callback to the request method won't refer to the variable i in the outer scope, but to the value that was available in the function's scope at the time the request was sent, and that is the value of i you expect.
The code would then be something like this:
for (var i = 0; i < docs.length; i++) {
    var URL = docs[i].url;
    (function(currentURL) {
        // now the URL is preserved as currentURL inside the scope of the function
        request(currentURL, function(error, response, html) {
            console.log(currentURL);
            // This value of currentURL is the one that was available in the scope chain
            // Other code...
        });
    })(URL);
}
Just wrap the code in a function or use forEach. This happens because of closure scope.
docs.forEach(function(doc) {
    var URL = doc.url;
    request(URL, function(error, response, html) {
        console.log(URL);
        // Other code...
    });
});
Another fix
for (var i = 0; i < docs.length; i++) {
    makeRequest(docs[i]);
}

function makeRequest(doc) {
    var URL = doc.url;
    request(URL, function(error, response, html) {
        console.log(URL);
    });
}
And another, slightly uglier fix with a closure inside the for loop:
for (var i = 0; i < docs.length; i++) {
    (function(doc) {
        var URL = doc.url;
        request(URL, function(error, response, html) {
            console.log(URL);
            // Other code...
        });
    })(docs[i]);
}
If you use something like JSHint, it will warn you not to create functions inside for loops, as doing so causes problems like this.
Just use let instead of var, i.e.:
for (let i = 0; i < docs.length; i++) {
    let URL = docs[i].url;
    request(URL, function(error, response, html) {
        console.log(URL);
        // other code...
    });
}
