Async piping file to an HTTP request - node.js

I'm trying to send off interleaved GET and POST requests to a server, but the POST request is sending data from a file, which seems to throw off the timing.
var async = require('async');
var http = require('http');
var request = require('request');
var fs = require('fs');
var arr = [];
for (var i = 1; i <= 50; i++) {
arr.push(i);
}
var limitedAgent = new http.Agent({maxSockets: 6});
function processThenSendRequest(data, onfinish) {
request.get({
url: 'http://www.google.com',
pool: limitedAgent
}, (function(j) {
return function(err, res) {
console.log("GET: response from " + j);
};
})(data)).on('socket', (function(j) {
return function(socket) {
console.log("GET: socket assigned for " + j);
}
})(data));
var source = fs.createReadStream('README.md');
var postReq = request.post({
url: 'http://www.google.com',
pool: limitedAgent
}, (function(j) {
return function(err, res) {
console.log("POST: response from " + j);
};
})(data)).on('socket', (function(j) {
return function(socket) {
console.log("POST: socket assigned for " + j);
}
})(data));
// source.pipe(postReq);
setTimeout(function() {
onfinish(null, data);
}, 10000);
}
async.map(arr, processThenSendRequest, function(err, results) {
if (err) console.error(err);
console.log("finished");
});
The code as written above runs fine, with the GET and POST requests being sent out in alternating order, but if I uncomment the source.pipe(postReq) line, then all the GET requests are sent before all the POST requests.
Is there a solution to this issue? I could use async.mapLimit, but that feels like a hack; my impression is that the solution should come through the request library, though that impression may be based on a misunderstanding.

Per my comment:
Because Node is entirely non-blocking (at least when written this way), you can't be sure anything will occur in a particular order unless you run it in series. async.series can do this for you, as can async.eachSeries.
Further to that, since Node doesn't wait for asynchronous activities to finish, each task gets queued up immediately, while the callbacks (the completion events) fire on a first-come, first-served basis. In your case, GET requests take far less time to make the round trip than POST requests, which is why they complete first.
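As a minimal sketch (reusing arr and processThenSendRequest from the question unchanged), serializing with async.eachSeries, or capping the parallelism with async.mapLimit, would look something like this in place of the async.map call:
var async = require('async');
// Fully serial: the next GET/POST pair only starts once the previous
// call has invoked its onfinish callback.
async.eachSeries(arr, processThenSendRequest, function(err) {
    if (err) console.error(err);
    console.log("finished (serial)");
});
// Or keep some parallelism but cap it, e.g. at three items in flight at once:
// async.mapLimit(arr, 3, processThenSendRequest, function(err, results) {
//     if (err) console.error(err);
//     console.log("finished (limited to 3)");
// });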


How to disconnect a socket after streaming data?

I am making use of "socket.io-client" and "socket.io-stream" to make a request and then stream some data. I have the following code that handles this logic.
Client Server Logic
router.get('/writeData', function(req, res) {
var io = req.app.get('socketio');
var nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
var nameNodeData = {};
async.waterfall([
checkForDataNodes,
readFileFromS3
], function(err, result) {
if (err !== null) {
res.json(err);
}else{
res.json("Finished Writing to DN's");
}
});
function checkForDataNodes(cb) {
nameNodeSocket.on('nameNodeData', function(data) {
nameNodeData = data;
console.log(nameNodeData);
cb(null, nameNodeData);
});
if (nameNodeData.numDataNodes === 0) {
cb("No datanodes found");
}
}
function readFileFromS3(nameNodeData, cb) {
for (var i in nameNodeData['blockToDataNodes']) {
var IP = nameNodeData['blockToDataNodes'][i]['ipValue'];
var dataNodeSocket = io.connect('http://'+ IP +":5000");
var ss = require("socket.io-stream");
var stream = ss.createStream();
var byteStartRange = nameNodeData['blockToDataNodes'][i]['byteStart'];
var byteStopRange = nameNodeData['blockToDataNodes'][i]['byteStop'];
paramsWithRange['Range'] = "bytes=" + byteStartRange.toString() + "-" + byteStopRange.toString();
//var file = require('fs').createWriteStream('testFile' + i + '.txt');
var getFileName = nameNodeData['blockToDataNodes'][i]['key'].split('/');
var fileData = {
'mainFile': paramsWithRange['Key'].split('/')[1],
'blockName': getFileName[1]
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
s3.getObject(paramsWithRange).createReadStream().pipe(stream);
//dataNodeSocket.disconnect();
}
cb(null);
}
});
Server Logic (that gets the data)
var dataNodeIO = require('socket.io')(server);
var ss = require("socket.io-stream");
dataNodeIO.on('connection', function(socket) {
console.log("Succesfully connected!");
ss(socket).on('sendData', function(stream, data) {
var IP = data['ipValue'];
var blockName = data['blockName'];
var mainFile = data['mainFile'];
dataNode.makeDir(mainFile);
dataNode.addToReport(mainFile, blockName);
stream.pipe(fs.createWriteStream(mainFile + '/' + blockName));
});
});
How can I properly disconnect the connections in the function readFileFromS3? I have noticed that calling dataNodeSocket.disconnect() at the end does not work, as I cannot verify the data was received on the second server. But if I comment it out, I can see the data being streamed to the second server.
My objective is to close the connections on the client server side.
It appears that the main problem with closing the socket is that you weren't waiting for the stream to be done writing before trying to close the socket. So, because the writing is all asynchronous and finishes sometime later, you were trying to close the socket before the data had been written.
Also, because you were putting asynchronous operations inside a for loop, you were running all your operations in parallel, which may not be what you want: it makes error handling harder and puts more load on the server.
Here's the code I would suggest that does the following:
Create a function streamFileFromS3() that streams a single file and returns a promise that will notify when it's done.
Use await in a for loop with that streamFileFromS3() to serialize the operations. You don't have to serialize them, but if you don't, you will have to change your error handling to decide what to do when one fails while the others are already running, and you will have to be more careful about concurrency issues.
Use try/catch to catch any errors from streamFileFromS3().
Add error handling on the stream.
Change all occurrences of data['propertyName'] to data.propertyName. The only time you need to use brackets is if the property name contains a character that is not allowed in a JavaScript identifier, or if the property name is in a variable. Otherwise, dot notation is preferred.
Add socket.io connection error handling logic for both socket.io connections.
Set the returned status to 500 when there's an error processing the request.
So, here's the code for that:
const ss = require("socket.io-stream");
router.get('/writeData', function(req, res) {
const io = req.app.get('socketio');
function streamFileFromS3(ip, data) {
return new Promise((resolve, reject) => {
const dataNodeSocket = io.connect(`http://${ip}:5000`);
dataNodeSocket.on('connect_error', reject);
dataNodeSocket.on('connect_timeout', () => {
reject(new Error(`timeout connecting to http://${ip}:5000`));
});
dataNodeSocket.on('connect', () => { // the socket.io client emits 'connect' (not 'connection')
// dataNodeSocket connected now
const stream = ss.createStream().on('error', reject);
paramsWithRange.Range = `bytes=${data.byteStart}-${data.byteStop}`;
const filename = data.key.split('/')[1];
const fileData = {
'mainFile': paramsWithRange.Key.split('/')[1],
'blockName': filename
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
// get S3 data and pipe it to the socket.io stream
s3.getObject(paramsWithRange).createReadStream().on('error', reject).pipe(stream);
stream.on('close', () => {
dataNodeSocket.disconnect();
resolve();
});
});
});
}
function connectError(msg) {
res.status(500).send(`Error connecting to ${NAMENODE_ADDRESS}`);
}
const nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
nameNodeSocket.on('connect_error', connectError).on('connect_timeout', connectError);
nameNodeSocket.on('nameNodeData', async (nameNodeData) => {
try {
for (let item of nameNodeData.blockToDataNodes) {
await streamFileFromS3(item.ipValue, item);
}
res.json("Finished Writing to DN's");
} catch(e) {
res.status(500).json(e);
}
});
});
Other notes:
I don't know what paramsWithRange is, as it is not declared here; when you were doing everything in parallel, it was shared among all the connections, which is asking for a concurrency problem. In my serialized implementation it's probably safe to share, but it is still a concurrency issue waiting to happen.
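If you later go back to running the streams in parallel, one way to sidestep that is to give each request its own copy of the params rather than mutating the shared object. A small sketch, assuming paramsWithRange is a plain S3 getObject params object (Bucket, Key, etc.); the paramsForBlock helper is hypothetical:
// Hypothetical helper: build a private copy of the shared S3 params for one block,
// so concurrent requests can't overwrite each other's Range header.
function paramsForBlock(block) {
    return Object.assign({}, paramsWithRange, {
        Range: `bytes=${block.byteStart}-${block.byteStop}`
    });
}
// Inside streamFileFromS3(), use the copy instead of mutating paramsWithRange:
// s3.getObject(paramsForBlock(data)).createReadStream().on('error', reject).pipe(stream);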

limit concurrent operations nodejs

This is a web scraping code written in node js.
Will this code always keep 5 concurrent requests when the queue has enough URLs?
Why does the console show otherwise?
var request = require("request");
var cheerio = require("cheerio");
var fs = require('fs');
var concurrent_requests = 0;
var queue = [];
var baseUrl = "https://angularjs.org/";
function makeApiCall(url){
if(url) {
queue.unshift(url);
}
if(concurrent_requests<5) {
var nextUrl = queue.pop();
if(nextUrl) {
concurrent_requests++;
request(nextUrl, function (error, response, body) {
var invalidUrl;
concurrent_requests--;
if(body) {
var $ = cheerio.load(body);
var anchors = $("a");
var data = "";
for (var i = 0; i < anchors.length; i++) {
url = $(anchors[i]).attr("href");
if(!url || url === "#" || url === "javascript:void(0)"){
invalidUrl = true;
}
else{
invalidUrl = false;
}
if (!invalidUrl) {
makeApiCall(url);
data += url + ", " + nextUrl + "\n";
}
}
//console.log(data);
fs.appendFile('urls.csv',data, function (err) {
if (err) throw err;
});
}
else{
makeApiCall();
}
});
}
}
console.log(concurrent_requests);
}
makeApiCall(baseUrl);
Because you have a condition, the if statement, that prevents more than 5 requests from being started:
if(concurrent_requests<5) {
This solution is not scalable, as it will overflow the stack after a certain number of recursive calls.
Hope it helps.
You are using an if condition to check whether the count of concurrent requests is less than five. But remember, it is an if statement, not a loop, so it runs only once per call.
You are making a recursive call to your function makeApiCall inside the callback of the request. The callback of the request only runs when the request is fulfilled.
With the above two points in mind: in your if condition you check concurrent_requests < 5 and then call request once, after which your program goes idle. Some time later, when the request is fulfilled, its callback runs, which after some logic calls makeApiCall again. So on every call you issue only one request, wait for it to resolve, and only then does your program proceed to the next request.
If you want concurrent requests, use a loop like this:
function makeApiCall(url){
if(url) {
queue.unshift(url);
}
// Use a loop here
while(concurrent_requests<5) {
var nextUrl = queue.pop();
if(nextUrl) {
concurrent_requests++;
request(nextUrl, function (error, response, body) {
var invalidUrl;
concurrent_requests--;
if(body) {
...
if (!invalidUrl) {
makeApiCall(url);
data += url + ", " + nextUrl + "\n";
}
}
...
}
else{
makeApiCall();
}
});
}
else{
// Remember to break out of loop when queue is empty to avoid infinite loop.
break;
}
}
console.log(concurrent_requests);
}
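For comparison, here is a rough sketch of the same five-at-a-time limit built on async.queue; the extractUrls helper is hypothetical, and the CSV writing from the original code is omitted:
var async = require("async");
var request = require("request");
var cheerio = require("cheerio");
// Hypothetical helper: pull usable hrefs out of a page body.
function extractUrls(body) {
    var $ = cheerio.load(body);
    return $("a").map(function () { return $(this).attr("href"); }).get()
        .filter(function (url) {
            return url && url !== "#" && url !== "javascript:void(0)";
        });
}
var q = async.queue(function (url, done) {
    request(url, function (error, response, body) {
        if (!error && body) {
            extractUrls(body).forEach(function (u) { q.push(u); });
        }
        done(); // freeing this slot lets the queue start the next URL
    });
}, 5); // at most five requests in flight at any time
q.push("https://angularjs.org/");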

delaying requests using request and cheerio modules

So this is the code I used to crawl my pages (I'm using the request and cheerio modules):
for (let j = 1; j < nbRequest; j++)
{
const currentPromise = new Promise((resolve, reject) => {
request(
`https://www.url${j}`,
(error, response, body) => {
if (error || !response) {
console.log("Error: " + error);
}
console.log("Status code: " + response.statusCode + ", Connected to the page");
var $ = cheerio.load(body);
let output = {
ranks: [],
names: [],
numbers: [],
};
$('td.rangCell').each(function( index ) {
if ($(this).text().trim() != "Rang")
{
output.ranks.push($(this).text().trim().slice(0, -1));
nbRanks = nbRanks + 1;
}
});
$('td.nameCell:has(label)').each(function( index ) {
output.names.push($(this).find('label.nameValue > a').text().trim());
});
$('td.numberCell').each(function( index ) {
if ($(this).text().trim() != "Nombre")
{
output.numbers.push($(this).text().trim());
}
});
console.log("HERE 1");
return resolve(output);
}
);
});
promises.push(currentPromise);
}
After that I'm parsing and saving the result in a CSV file using a node module.
At this point I've been able to crawl about 100 pages, but when it comes to much bigger numbers (1000+) I'm receiving a 500 response, meaning I'm being kicked, I think.
So I think the best solution is to delay the requests, but I haven't found how.
Do you guys have any idea, and what would the code look like?
What you are looking for is called "control flow"; you can achieve it by using async.queue, for example.
If you add every request to the queue, you can control the number of parallel requests through the number of workers. And you can add setTimeout calls to the final part of the request's callback to achieve the delaying of requests.
Additionally, I'd suggest using a "crawler" package (instead of building your own), e.g. npm-crawler, as they ship with built-in rate limiting and have already taken care of other things that you might face next :) e.g. a user-agent pool.
Update:
const async = require("async");
const delayTime = 1500; // wait 1.5 seconds after every new request
function getRequestPromise(csvLine) {
return new Promise((resolve, reject) => {
// make your request for this CSV line here, then call resolve() or reject(err)
});
}
const asyncQueue = async.queue(function(task, callback) {
getRequestPromise(task).then(_ => {
setTimeout(() => {
callback(null);
}, delayTime);
});
}, 1); // one request at a time
for (const csvLine of csvLines) { // pseudo: push every CSV line onto the queue
asyncQueue.push(csvLine, () => {});
}
asyncQueue.drain = () => {
console.log("finished.");
};

Node.js: How to perform endless loop with async module

I need to make an HTTP call and then put the response in a database. I should repeat this forever, and I have been reading about the async module, but I didn't understand how to combine these actions with waiting a couple of seconds between each iteration.
Can someone help?
Thanks in advance.
Look into async.forever. Your code would look something like this:
var async = require("async");
var http = require("http");
//Delay of 5 seconds
var delay = 5000;
async.forever(
function(next) {
http.get({
host: "google.com",
path: "/"
}, function(response) {
// Continuously update stream with data
var body = "";
response.on("data", function(chunk) {
body += chunk;
});
response.on("end", function() {
//Store data in database
console.log(body);
//Repeat after the delay
setTimeout(function() {
next();
}, delay)
});
});
},
function(err) {
console.error(err);
}
);
Why use a module just for this? Why don't you simply use setTimeout, like:
function makeRequest() {
request(url, function(response) {
saveInDatabase(function() {
// After save is complete, use setTimeout to call again
// "makeRequest" a few seconds later (Here 1 sec)
setTimeout(makeRequest, 1000);
});
});
}
The request and save parts of this code won't actually run as written, of course; it's just to give an example of what I was proposing.
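For completeness, here is a runnable version of that sketch using Node's core http module; saveInDatabase is a hypothetical stand-in for your real database write, and google.com stands in for your endpoint:
var http = require("http");
// Hypothetical stand-in for the real database write.
function saveInDatabase(doc, done) {
    console.log("saved " + doc.length + " bytes");
    done();
}
function makeRequest() {
    http.get("http://www.google.com/", function (response) {
        var body = "";
        response.on("data", function (chunk) { body += chunk; });
        response.on("end", function () {
            saveInDatabase(body, function () {
                // After the save completes, schedule the next iteration one second later.
                setTimeout(makeRequest, 1000);
            });
        });
    }).on("error", function (err) {
        console.error(err);
        setTimeout(makeRequest, 1000); // keep looping even after an error
    });
}
makeRequest();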

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP Get request and I want to parse the response and save it to my database.
If I call crawl(i) independently I get good results. But I have to call crawl() for every id from 1 to 2000.
I get good results, but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What I am crawling
My node functions :
function getOptions(i) {
return {
host: 'magicseaweed.com',
path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
method: 'GET'
}
};
function crawl(i){
var req = http.request(getOptions(i), function(res) {
res.on('data', function (body) {
parseLocation(body);
});
});
req.end();
}
function parseLocation(body){
parser.parseString(body, function(err, result) {
if(result && typeof result.rss != 'undefined') {
var locationTitle = result.rss.channel[0].title;
var locationString = result.rss.channel[0].item[0].link[0];
var location = new Location({
id: locationString.split('/')[2],
name: locationTitle
});
location.save();
}
});
}
var N = 2; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
q.drain = function() {
console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] Well, after a lot of testing it seems that the service I am crawling cannot handle so many requests that fast, because when I make each request sequentially I get all the good responses.
Is there a way to SLOW DOWN the async queue method?
You should have a look at this great module, async, which simplifies async tasks like this. You can use queue; simple example:
var N = 5; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
somehttprequestfunction(task.url, function(){
callback();
});
}, N);
q.drain = function() {
console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It will keep a window of ongoing actions, and a slot only becomes available for a future task when you invoke the callback function. The difference is that your current code opens 2000 connections immediately, and obviously the failure rate is high. Limiting it to a reasonable value (5, 10, 20; it depends on the site and connection) will result in a better success rate. If a request fails, you can always try it again, or push the task onto another async queue for another trial. The key point is to invoke callback() in the queue function, so that a slot frees up when the task is done.
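The "try it again" part could look roughly like this (a sketch, assuming somehttprequestfunction reports failures through an error argument to its callback):
var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function (err) {
        if (err && (task.tries || 0) < 3) {
            task.tries = (task.tries || 0) + 1;
            q.push(task); // re-queue the failed task for another attempt
        }
        callback(); // always free the slot so the queue keeps moving
    });
}, N);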
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
You're executing the next task immediately after starting the previous one; this way, the queue is meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next /* naming this argument 'next' is more meaningful */) {
crawl(task.url, function () {
// after this one is done, start next one.
next();
});
// or, more simply: crawl(task.url, next);
}, N);
Another option, if you want: vanilla JS without any fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
incrementer++
if(incrementer == 100){
clearInterval(myInterval)
//when done parse results array
}
//make request here
//push request result to array here
}, 500);
It invokes the function every half second; an easy way to force the requests to run one at a time and to exit after x requests.
I know I am a little late to the question, however here is a solution I wrote to slow down the number of requests when testing an api endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/")
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;
fs.readFile('test.txt', function(err, data){
var parsedItems = JSON.parse(data);
var urlparts = []
// create a queue
// range() is presumably a helper that yields each year from 1975 to 2016
for (let year of range(1975, 2016)) {
for (var make in parsedItems[year]){
console.log(year, make, '/models/' + year + '/' + make)
urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make})
}
}
// start dequeue
waitDequeue();
// This function calls itself after the makeRequest promise completes
function waitDequeue(){
var item = urlparts.pop()
if (item){
makeRequest(item)
.then(function(){
// wait this time before next dequeue
setTimeout(function() {
waitDequeue();
}, 3000);
})
} else {
write(parsedItems)
}
}
// make a request, mutate parsedItems then resolve
function makeRequest(item){
return new Promise((resolve, reject)=>{
request
.get(item.urlpart)
.set(options.auth[0], options.auth[1])
.set(options.type[0], options.type[1])
.end(function(err, res) {
if (err) return done1(err);
console.log(res.body)
res.body.forEach(function(model){
parsedItems[item.year][item.make][model] = {}
});
resolve()
})
})
}
// write the results back to the file
function write(parsedItems){
fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
console.log(err)
})
}
})
A little late, but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler, e.g.:
var q = async.priorityQueue(function(task, callback) {
// your processing code here for each task
// when you are ready to complete the task, delay it by calling:
var count = 0;
async.whilst( // wait ~10 seconds (10 iterations of 1 second each)
function() {
return count < 10;
},
function(callback) {
count++;
setTimeout(function() {
callback(null, count);
}, 1000);
},
function (err, n) {
// n seconds have passed
callback(); //callback to q handler
}
); //whilst
} , 5);
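If all you need is a fixed pause before releasing each slot, a plain setTimeout inside the task handler achieves the same effect with less machinery (a sketch, using a five-worker queue and a six-second pause):
var q = async.queue(function (task, callback) {
    // your processing code here for each task
    setTimeout(function () {
        callback(); // release this slot after six seconds
    }, 6000);
}, 5);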
