Node.js + Cheerio: Request inside a loop

I'm using cheerio, request and Node.js.
When I run the script below, it outputs the names in the wrong order. I believe this is caused by its asynchronous nature. How can I make it work in the "right" order? Do I need to use a sync package, or is there a way to change it so that it works synchronously?
app.get('/returned', function (req, res) {
    for (var y = 0; y < 10; y++) {
        var url = "http://example.com" + y + "/person.html";
        request(url, function (err, resp, body) {
            $ = cheerio.load(body);
            var links = $('#container');
            var name = links.find('span[itemprop="name"]').html(); // name
            if (name == null) {
                console.log("returned null");
            } else {
                console.log(name);
            }
        });
    }
});

Promises make this relatively easy:
app.get('/returned', function (req, res) {
    let urls = [];
    for (let y = 0; y < 10; y++) {
        urls.push('http://example.com' + y + '/person.html');
    }
    Promise.all(urls.map(function (url) {
        return new Promise(function (resolve, reject) {
            request(url, function (err, resp, body) {
                if (err) { return reject(err); }
                let $ = cheerio.load(body);
                let links = $('#container');
                let name = links.find('span[itemprop="name"]').html(); // name
                resolve({name: name, links: links, url: url});
            });
        });
    })).then(function (result) {
        result.forEach(function (obj) {
            if (obj.name == null) {
                console.log(obj.url, "returned null");
            } else {
                console.log(obj.url, obj.name);
            }
        });
    }).catch(function (err) {
        console.log(err);
    });
});
I started by creating an array of urls to get, then mapped that to an array of promises. When each request completes, I resolve its promise with the name, url, and links. When all promises have resolved, I loop over the result, which will be in the original order. The requests themselves run in parallel.
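If you prefer async/await, here is a minimal sketch of the same approach; the requestAsync helper (request promisified with util.promisify) is an addition for illustration and not part of the original answer:

const util = require('util');
const requestAsync = util.promisify(request); // hypothetical promisified helper

app.get('/returned', async function (req, res) {
    const urls = [];
    for (let y = 0; y < 10; y++) {
        urls.push('http://example.com' + y + '/person.html');
    }
    try {
        // run all requests in parallel; results keep the order of urls
        const results = await Promise.all(urls.map(async function (url) {
            const resp = await requestAsync(url);
            const $ = cheerio.load(resp.body);
            const name = $('#container').find('span[itemprop="name"]').html();
            return { url: url, name: name };
        }));
        results.forEach(function (obj) {
            console.log(obj.url, obj.name == null ? 'returned null' : obj.name);
        });
        res.sendStatus(200);
    } catch (err) {
        console.log(err);
        res.sendStatus(500);
    }
});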

Nope, you shouldn't have to use a sync package. IMO the cleanest way is to use a mature 3rd party library.
I'd recommend async.
The async.series method would execute all request functions in the order they are given, then allow you to register a callback to fire when all requests have been made, or when an error has occurred.
https://github.com/caolan/async#seriestasks-callback
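As a rough sketch of that suggestion (assuming the async library is installed; the buildTask helper is introduced here just for illustration), the first question's loop could look like this:

const async = require('async');

function buildTask(url) {
    return function (done) {
        request(url, function (err, resp, body) {
            if (err) { return done(err); }
            const $ = cheerio.load(body);
            const name = $('#container').find('span[itemprop="name"]').html();
            done(null, name);
        });
    };
}

const tasks = [];
for (let y = 0; y < 10; y++) {
    tasks.push(buildTask('http://example.com' + y + '/person.html'));
}

// runs the requests one after another, in order
async.series(tasks, function (err, names) {
    if (err) { return console.log(err); }
    names.forEach(function (name) {
        console.log(name == null ? 'returned null' : name);
    });
});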

Related

Requesting many connections to another site with Cross Origin Resource Sharing activated close in time makes the data corrupted

I am making a node.js application, and part of my code requests data from 193 different urls to download the json data from each one. Here is one of those urls: https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Apeldoorn For some of them the downloaded json data is fine and complete. Towards the end, however, some of the files get corrupted: part of the data becomes nullified, and some responses contain database errors. I think it has to do with requesting data from so many urls in a short amount of time (which is why I tried the "setTimeout" function, but that doesn't really work).
function writeToFile(url) {
    // get name to make each new file unique
    var name = url.split("json/")[1];
    var fileStream = fs.createWriteStream(`jsonFiles/${name}.json`);
    var options = {
        url: `${url}`,
        method: 'GET',
        headers: {
            'Accept': 'application/json',
            'Accept-Charset': 'utf-8',
            json: true
        }
    }
    //request the data from the site and download to the file.
    request.get(options).pipe(fileStream);
}
function getMunicipalityGeoJsonData(req, res) {
    //Get all the urls pointing to the JSON data for the province, Gelderland
    getGelderlandJsonUrls((err, jsonUrls) => {
        //for all those urls, write the data to files.
        for (url of jsonUrls) {
            console.log(url);
            writeToFile(url);
        }
    })
}
function getGelderlandJsonUrls(callback) {
    getMunicipalityJsonUrls("Gelderland", (err, data) => {
        jsonUrls = data;
        callback(null, jsonUrls);
    });
}
function getMunicipalityJsonUrls(provinceName, callback) {
    request({ uri: `https://www.gemeentegeschiedenis.nl/provincie/json/${provinceName}` }, (error, response, body) => {
        body = JSON.parse(body);
        // extracting each json URL from all the municipalities in Gelderland
        var jsonUrls = [];
        var numberMun = body.length;
        for (var i = 0; i < numberMun; i++) {
            var url = body[i].uri.naam;
            var urlSplit = url.split("gemeentenaam");
            var jsonUrl = urlSplit[0] + "gemeentenaam/json" + urlSplit[1];
            jsonUrl = jsonUrl.replace("http://", "https://");
            jsonUrls.push(jsonUrl);
        }
        callback(null, jsonUrls);
    });
}
The last json data downloaded into its file as an html page with a database error, from the url https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Zutphen, which took just under 6 seconds to load according to the network tab in Chrome.
The 1812 entry has null for its properties when it should have a bunch of coordinates: https://www.gemeentegeschiedenis.nl/gemeentenaam/json/Winssen (took just over a second to load in Chrome).
I am a noob at node, but please help me fix this issue, maybe with some sort of check for whether the data is corrupted. Thanks for the help in advance. :)
EDIT: I am trying to do up to 200 urls at a time in the for loop.
First off, add proper error handling to getMunicipalityJsonUrls() and to getGelderlandJsonUrls(). This means:
Check err parameter everywhere it's present and propagate the error back to the caller.
Capture possible errors from JSON.parse()
Check http statusCode.
Here's that fixed up code:
function getMunicipalityJsonUrls(provinceName, callback) {
    request({ uri: `https://www.gemeentegeschiedenis.nl/provincie/json/${provinceName}` }, (error, response, body) => {
        if (error) {
            callback(error);
            return;
        }
        if (response.statusCode !== 200) {
            callback(new Error(`http status code ${response.statusCode}`));
            return;
        }
        try {
            // each municipality object carries its url in uri.naam
            const jsonUrls = JSON.parse(body).map(item => {
                let urlSplit = item.uri.naam.split("gemeentenaam");
                let jsonUrl = urlSplit[0] + "gemeentenaam/json" + urlSplit[1];
                return jsonUrl.replace("http://", "https://");
            });
            callback(null, jsonUrls);
        } catch(e) {
            callback(e);
        }
    });
}
function getGelderlandJsonUrls(callback) {
    getMunicipalityJsonUrls("Gelderland", (err, data) => {
        if (err) {
            callback(err);
        } else {
            callback(null, data);
        }
    });
}
Then, in writeToFile(), add error handling and completion monitoring. I chose to wrap it in a promise rather than a plain callback because I want to use it with some utilities that work with promises.
function writeToFile(url) {
    return new Promise((resolve, reject) => {
        // get name to make each new file unique
        var name = url.split("json/")[1];
        var fileStream = fs.createWriteStream(`jsonFiles/${name}.json`);
        fileStream.on('error', (e) => {
            reject(e);
        });
        var options = {
            url: `${url}`,
            method: 'GET',
            headers: {
                'Accept': 'application/json',
                'Accept-Charset': 'utf-8',
                json: true
            }
        }
        //request the data from the site and download to the file.
        request.get(options).pipe(fileStream).on('error', (e) => {
            reject(e);
        }).on('finish', () => {
            resolve(url);
        });
    });
}
Now, we need to decide how to loop through all the URLs. If any of the urls could ever write to the same file (even as a remote possibility), then you have to serialize the URLs so that no two asynchronous operations are ever writing to the same file at the same time, because that will just mess up the file. If that were the case, you could serialize the writes like this:
// option 1 - serialize writing to files
function getMunicipalityGeoJsonData(req, res) {
    //Get all the urls pointing to the JSON data for the province, Gelderland
    getGelderlandJsonUrls(async (err, jsonUrls) => {
        if (err) {
            console.log(err);
            res.sendStatus(500);
        } else {
            try {
                //for all those urls, write the data to files.
                for (const url of jsonUrls) {
                    console.log(url);
                    await writeToFile(url);
                }
                res.send("All done");
            } catch(e) {
                console.log(e);
                res.sendStatus(500);
            }
        }
    });
}
If you are absolutely sure that none of these URLs will ever cause writing to the same file, then you can run N of them at a time, where you determine the lowest value of N that gets you decent performance. Higher values of N consume more peak resources (memory and file handles); lower values of N run fewer things in parallel. If the target hostnames all belong to the same server, then usually you don't want N to be more than about 5. If the target hosts you are retrieving data from are all different, you can experiment with values of N up to maybe 20.
// option 2 - run N at a time in parallel
function getMunicipalityGeoJsonData(req, res) {
    //Get all the urls pointing to the JSON data for the province, Gelderland
    getGelderlandJsonUrls((err, jsonUrls) => {
        if (err) {
            console.log(err);
            res.sendStatus(500);
        } else {
            //for all those urls, write the data to files.
            const numConcurrent = 5;
            mapConcurrent(jsonUrls, numConcurrent, writeToFile).then(() => {
                res.send("All done");
            }).catch(err => {
                console.log(err);
                res.sendStatus(500);
            });
        }
    })
}
The mapConcurrent() function comes from this answer, Promise.all consumes all my RAM, and is as follows. It expects you to pass it an array of items to be iterated over, the maximum number you want in flight at the same time, and a function that will be passed an array item and will return a promise tied to when it's done or has an error:
function mapConcurrent(items, maxConcurrent, fn) {
    let index = 0;
    let inFlightCntr = 0;
    let doneCntr = 0;
    let results = new Array(items.length);
    let stop = false;
    return new Promise(function(resolve, reject) {
        function runNext() {
            let i = index;
            ++inFlightCntr;
            fn(items[index], index++).then(function(val) {
                ++doneCntr;
                --inFlightCntr;
                results[i] = val;
                run();
            }, function(err) {
                // set flag so we don't launch any more requests
                stop = true;
                reject(err);
            });
        }
        function run() {
            // launch as many as we're allowed to
            while (!stop && inFlightCntr < maxConcurrent && index < items.length) {
                runNext();
            }
            // if all are done, then resolve parent promise with results
            if (doneCntr === items.length) {
                resolve(results);
            }
        }
        run();
    });
}
There are comparable functions in Bluebird's Promise.map() and in the Async library.
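For example, a minimal sketch with async.mapLimit(), assuming the async v3 library (which returns a promise when the final callback is omitted and accepts async-function iteratees), reusing the jsonUrls and writeToFile() names from above:

const async = require('async');

// limit to 5 writeToFile() operations in flight at a time
async.mapLimit(jsonUrls, 5, async (url) => writeToFile(url))
    .then(results => {
        console.log(`wrote ${results.length} files`);
    })
    .catch(err => {
        console.log(err);
    });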
So, using this code you now have the ability to control how many of your requests/writeToFile() operations are in flight at the same time, and you are capturing and logging all possible errors. That means you can tune how many can be in flight at once for best performance and lowest resource use and, if there are any errors, you should be logging them so you can debug.
This code is currently set to stop processing any further URLs if it gets an error. You can change that if you want to continue with the other URLs after an error by tweaking mapConcurrent(). But I would still make sure you log any errors so you know when they happen and can investigate why.
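As one hedged sketch of such a tweak, the rejection handler inside runNext() could record the failure and keep going instead of stopping; storing the Error object in that item's results slot is just one possible way to report it:

fn(items[index], index++).then(function (val) {
    ++doneCntr;
    --inFlightCntr;
    results[i] = val;
    run();
}, function (err) {
    ++doneCntr;
    --inFlightCntr;
    results[i] = err;   // keep the error in place of a result and continue
    run();
});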
One other note: if this were my code, I would convert everything to promises (no plain callbacks) and I'd use the got() library instead of the now deprecated request() library. I don't write any new code with the request() library.
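For illustration, a minimal sketch of writeToFile() rewritten with got and stream.pipeline, assuming got v11 (CommonJS) and Node 15+ for require('stream/promises'); treat it as a sketch rather than a drop-in replacement:

const got = require('got');
const fs = require('fs');
const { pipeline } = require('stream/promises');

async function writeToFile(url) {
    const name = url.split("json/")[1];
    // stream the response straight into the file; pipeline rejects on any error
    await pipeline(
        got.stream(url, { headers: { 'Accept': 'application/json' } }),
        fs.createWriteStream(`jsonFiles/${name}.json`)
    );
    return url;
}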

Request in for loop - scope / responses out of order

After I perform a request to a .m3u8 Master Playlist Url like https://something.example.com/master.m3u8 with
request(input, function (error, response, body) {
    ...
});
and after some processing I get back a set of renditions. I store them into an object along with some other prepared fields; the result looks like:
var set = {
    input_url: 'https://something.example.com/master.m3u8',
    renditions: [
        {
            id: 0,
            url: 'https://something.example.com/master_264.m3u8',
            chunks: []
        }, {
            id: 1,
            url: 'https://something.example.com/master_578.m3u8',
            chunks: []
        }, {
            id: 2,
            url: 'https://something.example.com/master_928.m3u8',
            chunks: []
        }]
};
So I have my set of renditions stored in order from lowest to highest rendition. Now I need to perform another request for each single rendition and store their response into the chunks of each rendition.
Problem: After the first callback I get an error message TypeError: Cannot read property 'chunks' of undefined - Here is my code
for (var i = 0; i < set.renditions.length; i++) {
    var done = false;
    request(set.renditions[i].url, function(error, response, body) {
        set.renditions[i].chunks.push(body);
        completed_requests++;
        if (completed_requests == set.renditions.length) {
            return callback(null, set);
        }
    });
}
I believe this has something to do with scope? If I don't reference the original var set within the callback, but push the responses to another array just as they come in, it works, but then they get stored out of order of course: first the response of the 5th rendition, then the 3rd, then the 4th and so on (as it is an async operation).
Any ideas how I should solve this, maybe something completely different? What's best practice? Thanks!
=======================
First Try:
I tried using a forEach, but it seems I don't get any result from it...
function chunks (rendition_set, callback) {
    var request_complete = false;
    var completed_requests = 0
    rendition_set.renditions.forEach ( function (entry) {
        request(entry.url, function(error, response, body) {
            var split = body.split('\n');
            for (var i = 0; i < split.length; i++) {
                if (split[i].indexOf('.ts') != -1 ) {
                    entry.chunks.push(split[i]);
                }
            }
            completed_requests++;
        });
    })
    while(request_complete == false) {
        if (completed_requests == rendition_set.renditions.length) {
            request_complete = true;
            return callback(null, rendition_set);
        }
    }
}
So I ended up using async, which basically handles all requests for all urls automatically, waits until everything is finished, and gives me the result back in one package, in the order the requests were made... which is pretty neat.
async.map(urls, httpGet, function (err, res) {
    if (err) return console.log(err);
    return callback(null, res);
});
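The httpGet iteratee isn't shown in the answer; a minimal sketch of what it might look like, built on the same request library used elsewhere in this thread (the name and signature are assumptions):

function httpGet(url, done) {
    request(url, function (error, response, body) {
        if (error) { return done(error); }
        // hand the body back to async.map; results stay in input order
        done(null, body);
    });
}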

delaying requests using request and cheerio modules

So this is the code I used to crawl my pages (I'm using the request and cheerio modules):
for (let j = 1; j < nbRequest; j++)
{
    const currentPromise = new Promise((resolve, reject) => {
        request(
            `https://www.url${j}`,
            (error, response, body) => {
                if (error || !response) {
                    console.log("Error: " + error);
                }
                console.log("Status code: " + response.statusCode + ", Connected to the page");
                var $ = cheerio.load(body);
                let output = {
                    ranks: [],
                    names: [],
                    numbers: [],
                };
                $('td.rangCell').each(function( index ) {
                    if ($(this).text().trim() != "Rang")
                    {
                        output.ranks.push($(this).text().trim().slice(0, -1));
                        nbRanks = nb_ranks+1;
                    }
                });
                $('td.nameCell:has(label)').each(function( index ) {
                    output.names.push($(this).find('label.nameValue > a').text().trim());
                });
                $('td.numberCell').each(function( index ) {
                    if ($(this).text().trim() != "Nombre")
                    {
                        output.numbers.push($(this).text().trim());
                    }
                });
                console.log("HERE 1");
                return resolve(output);
            }
        );
    });
    promises.push(currentPromise);
}
After that I'm parsing and saving the result in a csv file using a node module.
At this point I've been able to crawl about 100 pages, but when it comes to much bigger numbers (1000+) I'm receiving a 500 response, meaning that I'm being kicked, I think.
So I think the best solution is to delay the requests, but I haven't found a solution.
Do you guys have any idea, and how would the code look?
What you are looking for is called "Control Flow"; you can achieve this by using async.queue, for example.
If you add every request to the queue, you can control the amount of parallel requests with the number of workers. And you could add setTimeouts to the final part of the request's callback to achieve the delaying of requests.
Additionally, I'd suggest using a "crawler" package (instead of building your own), e.g. npm-crawler, as they ship with built-in rate limiting and have already taken care of other things you might face next :) e.g. a user-agent pool.
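As a hedged illustration of that suggestion, assuming the node-crawler package (the maxConnections and rateLimit options and the res.$ cheerio handle come from its documented API):

const Crawler = require('crawler');

const c = new Crawler({
    maxConnections: 2,   // at most 2 requests in flight
    rateLimit: 1500,     // wait 1.5 s between requests on the same connection
    callback: function (error, res, done) {
        if (error) {
            console.log(error);
        } else {
            const $ = res.$; // server-side cheerio handle provided by the package
            console.log($('title').text());
        }
        done();
    }
});

for (let j = 1; j < nbRequest; j++) {
    c.queue(`https://www.url${j}`);
}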
Update:
const async = require("async");
const request = require("request");
const delayTime = 1500; // wait 1.5 seconds after every new request

// wrap a single request in a promise; the task is assumed to carry a url
function getRequestPromise(task) {
    return new Promise((resolve, reject) => {
        request(task.url, (error, response, body) => {
            if (error) { return reject(error); }
            resolve(body);
        });
    });
}

const asyncQueue = async.queue(function(task, callback) {
    getRequestPromise(task).then(_ => {
        setTimeout(() => {
            callback(null);
        }, delayTime);
    });
}, 1); // one request at a time

// push every url onto the queue (the urls array is assumed to exist)
urls.forEach(url => {
    asyncQueue.push({ url: url }, () => {});
});

asyncQueue.drain = () => { // in async v3, use asyncQueue.drain(() => {...}) instead
    console.log("finished.");
};

JS Closures, Redis, loop, Async :: empty array

I give up on this. May some of the wise stackoverflow monks please fix my bugs?
The code is self-explanatory. The client sends room names, the server does a redis lookup and pushes valid rooms to the array. After adding all the rooms, the list should be emitted to the client.
The problem is closure/async based. I understand the problem but cannot find a workaround, because the array needs to remain inside the function. Tricky.
Code:
function roomList(socket){
    var roomlist = [], rooms = getRooms(), p = /pChannel_/;
    redis.select(7, function(err,res){
        for (var k in rooms){
            if(rooms[k] != '' && p.test(rooms[k])){
                var key = 'channel:'+rooms[k];
                redis.hgetall(key, function (err, reply) {
                    if(reply){
                        var c = io.sockets.manager.rooms[rooms[k]];
                        roomlist.push( Array(reply['name'],c.length,reply['icon']) );
                    }
                    else { console.log('nothing found'); }
                });
            }
        }
        // here be dragons
        console.log(roomlist);
        socket.emit('roomList', roomlist);
    });
}
Thanks.
C'mon guys. The OP explicitly said she/he is interested in understanding how things are supposed to work. And you don't need Q or async or any other 3rd party modules to implement this.
In the initial code, there are two problems:
with Javascript, the closure scope is at function level, not block level. A function must be introduced to define a proper closure. Here, a simple forEach can be used.
the final step (i.e. emit) is not run after the replies have been received from Redis. It must be called in the loop itself. In order to achieve it, it is required to count the items so that the inner callback can test whether the process is complete or not.
So here is another version:
function roomList(socket){
    var roomlist = [], rooms = getRooms(), p = /pChannel_/;
    redis.select(7, function(err,res){
        var count = rooms.length;
        rooms.forEach( function(r) {
            if( r != '' && p.test(r) ) {
                var key = 'channel:'+r;
                redis.hgetall(key, function (err, reply) {
                    if(reply) {
                        var c = io.sockets.manager.rooms[r];
                        roomlist.push( Array(reply['name'],c.length,reply['icon']) );
                    } else {
                        console.log('nothing found');
                    }
                    if ( --count <= 0 ) {
                        // here be dragons
                        console.log(roomlist);
                        socket.emit('roomList', roomlist);
                    }
                });
            } else --count;
        });
    });
}
Looks like a job for async.map:
function roomList(socket){
    var rooms = getRooms(), p = /pChannel_/;
    redis.select(7, function(err, res) {
        async.map(rooms, function(room, callback) {
            if (room === '' || ! p.test(room))
                return callback(null, null);
            var key = 'channel:' + room;
            var c = io.sockets.manager.rooms[room];
            redis.hgetall(key, function (err, reply) {
                if (err)
                    callback(err); // propagate Redis errors to final callback, don't know
                                   // if you want that or not; use 'callback(null)' if not.
                else if (reply)
                    callback(err, Array(reply.name, c.length, reply.icon) );
                else
                    callback(err, null);
            });
        }, function(err, roomlist) {
            if (err) {
                // handle Redis errors...
                return;
            }
            // filter 'null' entries from roomlist
            roomlist = roomlist.filter(function(room) { return room !== null });
            console.log(roomlist);
            socket.emit('roomList', roomlist);
        });
    });
}
(untested)
If you just want to wait for the room list to be fully built before emitting the response (as seems highly reasonable), and assuming Q is available, then you just need a few additional lines of Q magic, plus a closure-forming wrapper around the inner code to maintain a reliable reference to a Q deferred (and the current room) at each pass of the for loop.
function roomList(socket) {
    redis.select(7, function(err, res) {
        var list = [],
            rooms = getRooms(),
            p = /pChannel_/,
            promises = [];
        for(var k in rooms) {
            if(rooms[k] != '' && p.test(rooms[k])) {
                (function(dfrd, room) {
                    promises.push(dfrd.promise);
                    var key = 'channel:' + room;
                    redis.hgetall(key, function(err, reply) {
                        if(reply) {
                            // use the captured room name, not the loop variable,
                            // which will have moved on by the time this callback runs
                            var c = io.sockets.manager.rooms[room];
                            list.push( [reply['name'], c.length, reply['icon']] );
                        }
                        else {
                            console.log('nothing found');
                        }
                        dfrd.resolve();
                    });
                })(Q.defer(), rooms[k]);
            }
        }
        Q.all(promises).then(function() {
            console.log(list);
            socket.emit('roomList', list);
        });
    });
}

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP Get request and I want to parse the response and save it to my database.
If I call crawl(i) independently I get good results. But I have to call crawl() from 1 to 2000.
I get good results, but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What I am crawling
My node functions:
function getOptions(i) {
    return {
        host: 'magicseaweed.com',
        path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
        method: 'GET'
    }
};
function crawl(i){
    var req = http.request(getOptions(i), function(res) {
        res.on('data', function (body) {
            parseLocation(body);
        });
    });
    req.end();
}
function parseLocation(body){
    parser.parseString(body, function(err, result) {
        if(result && typeof result.rss != 'undefined') {
            var locationTitle = result.rss.channel[0].title;
            var locationString = result.rss.channel[0].item[0].link[0];
            var location = new Location({
                id: locationString.split('/')[2],
                name: locationTitle
            });
            location.save();
        }
    });
}
N = 2 //# of simultaneous tasks
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);
q.drain = function() {
    console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
    q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] Well, after a lot of testing, it seems that the service I am crawling cannot handle so many requests that fast, because when I do each request sequentially, I can get all the good responses.
Is there a way to SLOW DOWN the ASYNC queue method?
You should have a look at this great module, async, which simplifies tasks like this. You can use queue; simple example:
var N = 5; // # of simultaneous tasks
var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function(){
        callback();
    });
}, N);
q.drain = function() {
    console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
    q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It will keep a window of ongoing actions, and a slot in the queue becomes available for a future task only once you invoke the callback function. The difference is that your current code opens 2000 connections immediately, and obviously the failure rate is high. Limiting it to a reasonable value (5, 10, 20, depending on the site and connection) will result in a better success rate. If a request fails, you can always try it again, or push the task onto another async queue for another trial. The key point is to invoke callback() in the queue function, so that a slot becomes available when a task is done.
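A minimal sketch of that retry idea, reusing the q, N, and somehttprequestfunction names from the snippet above; the retriesLeft counter and the error-first callback are assumptions added here:

var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function (err) {
        if (err && task.retriesLeft > 0) {
            // push the failed task back onto the queue for another attempt
            q.push({ url: task.url, retriesLeft: task.retriesLeft - 1 });
        }
        callback();
    });
}, N);

q.push({ url: "http://somewebsite.com/1/feed/", retriesLeft: 3 });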
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);
You're executing the next task immediately after starting the previous one; this way, the queue is just meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* name this argument as 'next' is more meaningful */) {
    crawl(task.url, function () {
        // after this one is done, start next one.
        next();
    });
    // or, more simple way, crawl(task.url, next);
}, N);
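A minimal sketch of crawl() reworked to take that callback, based on the OP's http.request/parseLocation code earlier in this question; it also accumulates the response chunks before parsing, which the original 'data' handler did not:

function crawl(url, done) {
    http.get(url, function (res) {
        var body = '';
        res.on('data', function (chunk) { body += chunk; });
        res.on('end', function () {
            parseLocation(body);
            done();          // tell the queue this task is finished
        });
    }).on('error', function (err) {
        console.log(err);
        done();              // still release the queue slot on error
    });
}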
Another option if you want. Vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
    incrementer++
    if(incrementer == 100){
        clearInterval(myInterval)
        //when done parse results array
    }
    //make request here
    //push request result to array here
}, 500);
Invokes the function every half second. An easy way to pace the requests and exit after x of them.
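A filled-in sketch of that skeleton, assuming the request library and the magicseaweed feed urls from the question:

var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function () {
    incrementer++;
    if (incrementer > 100) {
        clearInterval(myInterval);
        console.log('collected', resultsArray.length, 'responses'); // parse results here
        return;
    }
    // make one request per tick and store its body
    request('http://magicseaweed.com/syndicate/rss/index.php?id=' + incrementer + '&unit=uk',
        function (err, resp, body) {
            if (!err) { resultsArray.push(body); }
        });
}, 500);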
I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an api endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/");
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;

// small helper so the year loop below works; plain JS has no built-in range()
function* range(start, end) {
    for (var i = start; i <= end; i++) yield i;
}

fs.readFile('test.txt', function(err, data){
    var parsedItems = JSON.parse(data);
    var urlparts = [];
    // create a queue
    for (let year of range(1975, 2016)) {
        for (var make in parsedItems[year]){
            console.log(year, make, '/models/' + year + '/' + make);
            urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make});
        }
    }
    // start dequeue
    waitDequeue();
    // This function calls itself after the makeRequest promise completes
    function waitDequeue(){
        var item = urlparts.pop();
        if (item){
            makeRequest(item)
                .then(function(){
                    // wait this time before next dequeue
                    setTimeout(function() {
                        waitDequeue();
                    }, 3000);
                });
        } else {
            write(parsedItems);
        }
    }
    // make a request, mutate parsedItems then resolve
    function makeRequest(item){
        return new Promise((resolve, reject)=>{
            request
                .get(item.urlpart)
                .set(options.auth[0], options.auth[1])
                .set(options.type[0], options.type[1])
                .end(function(err, res) {
                    if (err) return reject(err);
                    console.log(res.body);
                    res.body.forEach(function(model){
                        parsedItems[item.year][item.make][model] = {};
                    });
                    resolve();
                });
        });
    }
    // write the results back to the file
    function write(parsedItems){
        fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
            console.log(err);
        });
    }
});
A little late, but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler, e.g.:
var q = async.priorityQueue(function(task, callback) {
    // your code process here for each task
    // when ready to complete the task, delay it by calling
    var count = 0;
    async.whilst( // wait 10 seconds (10 x 1-second ticks)
        function() {
            return count < 10;
        },
        function(callback) {
            count++;
            setTimeout(function() {
                callback(null, count);
            }, 1000);
        },
        function (err, n) {
            // n seconds have passed
            callback(); // callback to q handler
        }
    ); // whilst
}, 5);
