I realize a great deal has been said on this topic, but I believe I am closing my connections correctly, and would appreciate a fresh set of eyes to tell me if I am missing something.
An object containing the request information is created and passed to throttle.
throttle pushes it onto the stack and starts an interval timer that calls process_queue every 1200 ms; get_info is passed as the request callback.
At some point during execution (it varies), socket hang up and connect ETIMEDOUT exceptions are thrown, at which point I re-queue the pending requests, wait a short interval, and start process_queue again.
This works fine early on, but the occurrence of exceptions seems to accelerate over time, and really, they shouldn't be happening at all if I am closing the connections correctly.
Any feedback is greatly appreciated.
var timer = null;
var takeabreather = null;
var stack = [];
var pendingqueue = [];
function throttle(item) {
stack.push(item);
pendingqueue.push(item);
if (timer === null) {
timer = setInterval(process_queue, 1200);
}
}
function process_queue() {
var item = stack.shift();
var req = http.request(item.opts, get_info);
req.on('error', function(e) {
logger.error('--- PROCESS_QUEUE: ERROR: ' + e);
req.end();
});
req.end(function(){
logger.debug('PROCESS_QUEUE: ENDING...');
})
// clear the timer if there is no work left to do...
if (stack.length === 0) {
clearInterval(timer);
timer = null;
logger.info('PROCESS_QUEUE: queue is empty');
}
}
function get_info(response) {
var body = '';
response.on('data', function(d) {
body += d;
});
response.on('end', function() {
var parsed = JSON.parse(body);
var doc = {};
parsed.forEach(function (item) {
try {
doc.name = item.name;
}
catch (err) {
logger.error('--- GET_INFO ERROR: ', response.req.path, err);
}
});
// code to remove item from pending queue redacted //
logger.debug('--- GET_INFO END: ', response.req.path);
});
}
process.on('uncaughtException', function (e) {
logger.error('--- UNCAUGHT_EXCEPTION: ' + e);
clearInterval(timer);
timer = null;
if (takeabreather === null ) {
logger.warn('--- REQUEUING...');
stack = pendingqueue;
logger.warn('--- TAKING A BREATHER...' );
takeabreather = setTimeout(process_queue, 10000);
}
});
As it turns out, I had a nested http.request in get_info that was never being closed via .end().
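For anyone hitting the same symptom, here is a minimal sketch of that kind of fix (the inner request, its host/path, and the res2 name are illustrative placeholders, not the original code): a nested http.request must also be ended, and its response drained, or its socket lingers and the pool eventually starves:
function get_info(response) {
    var body = '';
    response.on('data', function (d) { body += d; });
    response.on('end', function () {
        // hypothetical nested lookup, standing in for the real one
        var inner = http.request({ host: 'example.com', path: '/detail' }, function (res2) {
            res2.resume(); // drain the response so the socket is released
        });
        inner.on('error', function (e) { logger.error('--- NESTED ERROR: ' + e); });
        inner.end(); // the missing call: without it the request is never even sent
    });
}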
I am making use of "socket.io-client" and "socket.io-stream" to make a request and then stream some data. I have the following code that handles this logic:
Client Server Logic
router.get('/writeData', function(req, res) {
var io = req.app.get('socketio');
var nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
var nameNodeData = {};
async.waterfall([
checkForDataNodes,
readFileFromS3
], function(err, result) {
if (err !== null) {
res.json(err);
}else{
res.json("Finished Writing to DN's");
}
});
function checkForDataNodes(cb) {
nameNodeSocket.on('nameNodeData', function(data) {
nameNodeData = data;
console.log(nameNodeData);
cb(null, nameNodeData);
});
if (nameNodeData.numDataNodes === 0) {
cb("No datanodes found");
}
}
function readFileFromS3(nameNodeData, cb) {
for (var i in nameNodeData['blockToDataNodes']) {
var IP = nameNodeData['blockToDataNodes'][i]['ipValue'];
var dataNodeSocket = io.connect('http://'+ IP +":5000");
var ss = require("socket.io-stream");
var stream = ss.createStream();
var byteStartRange = nameNodeData['blockToDataNodes'][i]['byteStart'];
var byteStopRange = nameNodeData['blockToDataNodes'][i]['byteStop'];
paramsWithRange['Range'] = "bytes=" + byteStartRange.toString() + "-" + byteStopRange.toString();
//var file = require('fs').createWriteStream('testFile' + i + '.txt');
var getFileName = nameNodeData['blockToDataNodes'][i]['key'].split('/');
var fileData = {
'mainFile': paramsWithRange['Key'].split('/')[1],
'blockName': getFileName[1]
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
s3.getObject(paramsWithRange).createReadStream().pipe(stream);
//dataNodeSocket.disconnect();
}
cb(null);
}
});
Server Logic (that gets the data)
var dataNodeIO = require('socket.io')(server);
var ss = require("socket.io-stream");
dataNodeIO.on('connection', function(socket) {
console.log("Succesfully connected!");
ss(socket).on('sendData', function(stream, data) {
var IP = data['ipValue'];
var blockName = data['blockName'];
var mainFile = data['mainFile'];
dataNode.makeDir(mainFile);
dataNode.addToReport(mainFile, blockName);
stream.pipe(fs.createWriteStream(mainFile + '/' + blockName));
});
});
How can I properly disconnect the connections in readFileFromS3? I have noticed that calling dataNodeSocket.disconnect() at the end does not work, as I cannot verify the data was received on the second server. But if I comment it out, I can see the data being streamed to the second server.
My objective is to close the connections on the client-server side.
It appears that the main problem with closing the socket is that you weren't waiting for the stream to finish writing before trying to close the socket. Because the writing is all asynchronous and finishes some time later, you were trying to close the socket before the data had been written.
Also, because you were putting asynchronous operations inside a for loop, you were running all your operations in parallel, which may not be exactly what you want: it makes error handling harder and increases server load.
Here's the code I would suggest that does the following:
Create a function streamFileFromS3() that streams a single file and returns a promise that will notify when it's done.
Use await in a for loop with that streamFileFromS3() to serialize the operations. You don't have to serialize them, but then you would have to change your error handling to figure out what to do if one errors while the others are already running and you'd have to be more careful about concurrency issues.
Use try/catch to catch any errors from streamFileFromS3().
Add error handling on the stream.
Change all occurrences of data['propertyName'] to data.propertyName. The only time you need to use brackets is if the property name contains a character that is not allowed in a Javascript identifier or if the property name is in a variable. Otherwise, the dot notation is preferred.
Add socket.io connection error handling logic for both socket.io connections.
Set returned status to 500 when there's an error processing the request
So, here's the code for that:
const ss = require("socket.io-stream");
router.get('/writeData', function(req, res) {
const io = req.app.get('socketio');
function streamFileFromS3(ip, data) {
return new Promise((resolve, reject) => {
const dataNodeSocket = io.connect(`http://${ip}:5000`);
dataNodeSocket.on('connect_error', reject);
dataNodeSocket.on('connect_timeout', () => {
reject(new Error(`timeout connecting to http://${ip}:5000`));
});
dataNodeSocket.on('connect', () => {
// dataNodeSocket connected now
const stream = ss.createStream().on('error', reject);
paramsWithRange.Range = `bytes=${data.byteStart}-${data.byteStop}`;
const filename = data.key.split('/')[1];
const fileData = {
'mainFile': paramsWithRange.Key.split('/')[1],
'blockName': filename
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
// get S3 data and pipe it to the socket.io stream
s3.getObject(paramsWithRange).createReadStream().on('error', reject).pipe(stream);
stream.on('close', () => {
dataNodeSocket.disconnect();
resolve();
});
});
});
}
function connectError(msg) {
res.status(500).send(`Error connecting to ${NAMENODE_ADDRESS}`);
}
const nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
nameNodeSocket.on('connect_error', connectError).on('connect_timeout', connectError);
nameNodeSocket.on('nameNodeData', async (nameNodeData) => {
try {
for (let item of nameNodeData.blockToDataNodes) {
await streamFileFromS3(item.ipValue, item);
}
res.json("Finished Writing to DN's");
} catch(e) {
res.status(500).json(e);
}
});
});
Other notes:
I don't know what paramsWithRange is, as it is not declared here; when you were doing everything in parallel, it was being shared among all the connections, which is asking for a concurrency problem. In my serialized implementation it's probably safe to share, but the way it is now is a concurrency issue waiting to happen.
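If it does have to be shared, one way to defuse that risk (a minimal sketch, assuming paramsWithRange is a plain S3 params object) is to clone it per request inside streamFileFromS3(), so the shared template is never mutated:
function streamFileFromS3(ip, data) {
    // per-request copy; each stream gets its own Range without touching the template
    const params = Object.assign({}, paramsWithRange, {
        Range: `bytes=${data.byteStart}-${data.byteStop}`
    });
    // ...then use `params` in place of paramsWithRange below
}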
I'm finding it almost impossible to capture the responses of HTTP requests in a loop as an array. I can see the array in console.log, but when I make the array the response of the HTTP server, I get a blank array. What am I doing wrong, or is there a better way to do this?
Code :
router.route('/uprns').post(function(request, response){
response.setHeader('content-type', 'application/text');
console.log('first element from the array is '+request.body.UPRNS[0]);
console.log('Number of items in array is '+request.body.UPRNS.length);
if (request.body.UPRNS.length == 0) {
response.send( 'no UPRNS in request' );
}
var output = [];
var obj = '';
for( var i = 0; i < request.body.UPRNS.length; i++) {
obj = request.body.UPRNS[i];
//Make HTTP calls to
var options = {
host: 'orbisdigital.azure-api.net',
path: '/nosecurity/addresses?uprn='+obj // full URL as path
};
callback = function(res) {
res.on('data', function (chunk) {
output.push(chunk.toString());
});
// the whole response has been received
res.on('end', function () {
console.log(output);
});
}
Https.request(options, callback).end();
}
response.send(output);
});
I know there is a lot of talk about blocking the process in a for loop, but there is no definitive recommended way to deal with HTTP calls in a loop.
Thank you.
Here is the code; see the added comments. Do some reading on asynchronous programming with node.js; here's a starter.
router.route( '/uprns' ).post( function ( request, response ) {
response.setHeader( 'content-type', 'application/text' );
console.log( 'first element from the array is ' + request.body.UPRNS[ 0 ] ); // your 1st element in JSON array.
console.log( 'Number of items in array is ' + request.body.UPRNS.length );
var output = [];
var obj = '';
for ( var i = 0; i < request.body.UPRNS.length; i++ ) {
obj = request.body.UPRNS[ i ];
console.log( obj );
//Make HTTP calls to
var options = {
host: 'orbisdigital.azure-api.net',
path: '/nosecurity/addresses?uprn=' + obj // full URL as path
};
Https.request( options, callback ).on( 'error', function ( err ) {
// Errors on the outgoing request fire here, not on the response.
console.error( err.stack );
if ( !response.headersSent ) {
response.status( 500 ).send( 'there was an error' );
}
} ).end();
}
var countResponses = 0;
// Don't make functions in a loop, so I moved this function down
// here.
function callback( res ) {
res.on( 'data', function ( chunk ) {
output.push( chunk.toString() );
});
// the whole response has been received
res.on( 'end', function () {
console.log( output );
countResponses++;
if (countResponses === request.body.UPRNS.length) {
// Previously this code was executed directly
// after the loop finished. It did not wait for
// all the responses, so it sent the empty response.
// However, the other console.log(output) statements
// were called after this.
//
// There is a bug here that if request.body.UPRNS.length
// is zero, then the user will never get a response. I
// let you fix this up :).
response.send( output );
}
} );
}
} );
A better way to handle this scenario is to use async.js instead of for loops: https://github.com/caolan/async
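For instance, here is a minimal sketch using async.mapLimit (reusing the host and path from the question; the limit of 5 parallel requests is an arbitrary choice): it caps concurrency and delivers the results in input order once every request has ended:
var async = require('async');
var Https = require('https');

router.route('/uprns').post(function (request, response) {
    async.mapLimit(request.body.UPRNS, 5, function (uprn, done) {
        var options = {
            host: 'orbisdigital.azure-api.net',
            path: '/nosecurity/addresses?uprn=' + uprn
        };
        Https.request(options, function (res) {
            var body = '';
            res.on('data', function (chunk) { body += chunk; });
            res.on('end', function () { done(null, body); });
        }).on('error', done).end();
    }, function (err, output) {
        if (err) return response.status(500).send('there was an error');
        response.send(output); // one entry per UPRN, in the original order
    });
});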
I have the following PHP Script on server that will wait 10 seconds and say Hello:
<?php sleep(10); ?>Hello
On the client side (node), I have the following:
var http = require('http');
http.get ('http://example.com/sleep.php', function (resp) {
resp.on('data', function (d) {
console.log ('data!', d.toString());
});
resp.on('end', function (d) {
console.log ('Finished!');
});
}).on('error', function (e) {
console.log ('error:', e);
});
The problem is, if the internet connection drops during the request, neither the error nor the end event is triggered.
To re-produce the problem:
Place the PHP script somewhere on the Internet
Execute the node script
Disconnect the Internet
The script does nothing
I've also found that if the connection is back within 10 seconds, it can still receive the message.
So, I made a simple interval loop to check the status. However, it can't detect whether the connection has stopped working or is still waiting for a response:
var http = require('http');
var lastRespond = 0, intervalCheck;
var respFinished = false;
http.get ('http://jixun.no-ip.org/sleep.php', function (resp) {
resp.on('data', function (d) {
lastRespond = new Date;
console.log ('data!', d.toString());
});
resp.on('end', function (d) {
respFinished = true;
console.log ('Finished!');
});
}).on('error', function (e) {
console.log ('error:', e);
});
intervalCheck = setInterval(function () {
if (respFinished) {
clearInterval(intervalCheck);
} else if (new Date - lastRespond >= 120000) {
console.log ('Timeout :(');
clearInterval(intervalCheck);
}
}, 120000); // 2 mins.
So my question is: Is there any way to check if the socket closed / connection stopped after sending the request?
Thanks in advance.
Using setTimeout on the actual request could solve your problem. It's not an actual event like 'close', 'end', and 'error', but the sample below reproduces and solves the issue; I haven't tried it in another context though.
var http = require('http');
http.get ('http://fake-response.appspot.com?sleep=5', function (resp) {
resp.on('data', function (d) {
console.log ('data!', d.toString());
});
resp.on('end', function (d) {
console.log ('Finished!');
});
}).on('error', function (e) {
console.log ('error:', e);
}).setTimeout(12000, function ( ) {
console.log('Timeout reached...');
process.exit(1);
});
More information can be found in the documentation. Either use that, or listen for the 'close' event as well; that works well with the net module.
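For reference, here is a small sketch of that 'close' approach (reusing the sleep.php example from the question): the request emits a 'socket' event, and the underlying net.Socket emits 'close' with a flag indicating whether it closed due to an error:
var http = require('http');

http.get('http://example.com/sleep.php', function (resp) {
    resp.on('data', function (d) { console.log('data!', d.toString()); });
    resp.on('end', function () { console.log('Finished!'); });
}).on('socket', function (socket) {
    socket.on('close', function (hadError) {
        console.log('socket closed', hadError ? 'after an error' : 'cleanly');
    });
});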
Maybe this would be useful for you:
Create a bash script that checks the connection (grabbed from https://stackoverflow.com/a/14939373/1779015 with small modifications):
#!/bin/bash
# Test for network connection
for interface in $(ls /sys/class/net/ | grep -v lo);
do
if [[ $(cat /sys/class/net/$interface/carrier 2> /dev/null) = 1 ]]; then OnLine=1; fi
done
if ! [ $OnLine ]; then echo "0";
else
echo "1";
fi
Then call it from the node script and read its stdout, for example with child_process (http://nodejs.org/api/child_process.html):
var spawn = require('child_process').spawn,
hasInet = spawn('./test-inet.sh');
hasInet.stdout.pipe(process.stdout);
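To branch on the result rather than just piping it, you could buffer stdout and compare it against the '1'/'0' the script prints (a sketch, assuming the script above is saved as ./test-inet.sh and marked executable):
var spawn = require('child_process').spawn;

function checkInet(callback) {
    var hasInet = spawn('./test-inet.sh');
    var out = '';
    hasInet.stdout.on('data', function (d) { out += d; });
    hasInet.on('close', function () {
        callback(out.trim() === '1'); // true when some interface has carrier
    });
}

checkInet(function (online) {
    console.log(online ? 'online' : 'offline');
});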
Here is my solution... instead of using setInterval, I check whether the 'data' event keeps firing until the 'end' event.
var http = require('http');
var url = require('url');
var fs = require('fs');

var file_url = 'http://example.com/file'; // placeholder: the URL to download
var filename = 'downloaded_file.txt'; // placeholder: local path to save to
var options = {};
var parsed_url = url.parse(file_url, true);
var req_options = {
path: parsed_url.pathname,
host: parsed_url.hostname
};
var file = fs.createWriteStream(filename, options);
try{
const check_data_timeout = 10000;
var check_data_timer = 0;
var current_size = 0;
var request = http.get(req_options, function(response) {
var len = parseInt(response.headers['content-length'], 10);
response.on("data", function(chunk) {
current_size += chunk.length;
var percent = (100.0 * current_size / len).toFixed(2);
console.log("Download percent : "+percent+"%");
clearTimeout(check_data_timer);
check_data_timer = setTimeout(function(){
console.log("UPS !! No new data ... connection must be stalled");
},check_data_timeout);
});
response.on("end", function() {
console.log("Response ENDED !!!");
clearTimeout(check_data_timer);
});
response.pipe(file);
check_data_timer = setTimeout(function(){
console.log("UPS !! No new data ... connection must be stalled");
},check_data_timeout);
}).once('error', function(error) {
console.log("Response ERROR !!!");
});
}catch(e){
console.log(e);
}
Hope it helps ...
I'm trying to convert an existing API to work with RxJS... I'm fairly new to node, and very new to RxJS, so please bear with me.
I have an existing API (getNextMessage), that either blocks (asynchronously), or returns a new item or error via a node-style (err, val) callback, when the something becomes available.
so it looks something like:
getNextMessage(nodeStyleCompletionCallback);
You could think of getNextMessage like an HTTP request that completes in the future, when the server responds; but you do need to call getNextMessage again, once a message is received, to keep getting new items from the server.
So, in order to make it into an observable collection, I have to get RxJS to keep calling my getNextMessage function until the subscriber is disposed.
Basically, I'm trying to create my own RxJs observable collection.
The problems are:
I don't know how to make subscriber.dispose() kill the async.forever
I probably shouldn't be using async.forever in the first place
I'm not sure I should even be getting 'completed' for each message - shouldn't that come only at the end of a sequence?
I'd like to eventually remove the need for using fromNodeCallback, to have a first class RxJS observable
Clearly I'm a little confused.
Would love a bit of help, thanks!
Here is my existing code:
var Rx = require('rx');
var port = require('../lib/port');
var async = require('async');
function observableReceive(portName)
{
var observerCallback;
var listenPort = new port(portName);
var disposed = false;
var asyncReceive = function(asyncCallback)
{
listenPort.getNextMessage(
function(error, json)
{
observerCallback(error, json);
if (!disposed)
setImmediate(asyncCallback);
}
);
}
return function(outerCallback)
{
observerCallback = outerCallback;
async.forever(asyncReceive);
}
}
var receive = Rx.Observable.fromNodeCallback(observableReceive('rxtest'));
var source = receive();
var subscription = source.forEach(
function (json)
{
console.log('receive completed: ' + JSON.stringify(json));
},
function (error) {
console.log("receive failed: " + error.toString());
},
function () {
console.log('Completed');
subscription.dispose();
}
);
So here's probably what I would do.
var Rx = require('Rx');
// This is just for kicks. You have your own getNextMessage to use. ;)
var getNextMessage = (function(){
var i = 1;
return function (callback) {
setTimeout(function () {
if (i > 10) {
callback("lawdy lawd it's ova' ten, ya'll.");
} else {
callback(undefined, i++);
}
}, 5);
};
}());
// This just makes an observable version of getNextMessage.
var nextMessageAsObservable = Rx.Observable.create(function (o) {
getNextMessage(function (err, val) {
if (err) {
o.onError(err);
} else {
o.onNext(val);
o.onCompleted();
}
});
});
// This repeats the call to getNextMessage as many times (11) as you want.
// "take" will cancel the subscription after receiving 11 items.
nextMessageAsObservable
.repeat()
.take(11)
.subscribe(
function (x) { console.log('next', x); },
function (err) { console.log('error', err); },
function () { console.log('done'); }
);
I realize this is over a year old, but I think a better solution for this would be to make use of recursive scheduling instead:
Rx.Observable.forever = function(next, scheduler) {
scheduler = scheduler || Rx.Scheduler.default;
// Internally wrap the callback into an observable
next = Rx.Observable.fromNodeCallback(next);
return Rx.Observable.create(function(observer) {
var disposable = new Rx.SingleAssignmentDisposable(),
hasState = false;
disposable.setDisposable(scheduler.scheduleRecursiveWithState(null,
function(state, self) {
hasState && observer.onNext(state);
hasState = false;
next().subscribe(function(x){
hasState = true;
self(x);
}, observer.onError.bind(observer));
}));
return disposable;
});
};
The idea here is that you can schedule a new item once the previous one has completed. You call next(), which invokes the passed-in method, and when it produces a value, you schedule the next invocation.
You can then use it like so:
Rx.Observable.forever(getNextMessage)
.take(11)
.subscribe(function(message) {
console.log(message);
});
See a working example here
I have an HTTP Get request and I want to parse the response and save it to my database.
If I call crawl(i) independently, I get good results. But I have to call crawl() from 1 to 2000.
I get good results, but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
What I am crawling
My node functions:
function getOptions(i) {
return {
host: 'magicseaweed.com',
path: '/syndicate/rss/index.php?id='+i+'&unit=uk',
method: 'GET'
}
};
function crawl(i){
var req = http.request(getOptions(i), function(res) {
res.on('data', function (body) {
parseLocation(body);
});
});
req.end();
}
function parseLocation(body){
parser.parseString(body, function(err, result) {
if(result && typeof result.rss != 'undefined') {
var locationTitle = result.rss.channel[0].title;
var locationString = result.rss.channel[0].item[0].link[0];
var location = new Location({
id: locationString.split('/')[2],
name: locationTitle
});
location.save();
}
});
}
var N = 2; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
q.drain = function() {
console.log('Crawling done.');
}
for(var i = 0; i < 100; i++){
q.push({url: 'http://magicseaweed.com/syndicate/rss/index.php?id='+i+'&unit=uk'});
}
[EDIT] Well, after a lot of testing, it seems that the service I am crawling cannot handle so many requests that fast, because when I make each request sequentially I get all the good responses.
Is there a way to slow down the async queue method?
You should have a look at this great module, async, which simplifies async tasks like this. You can use queue; here is a simple example:
var N = 5; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
somehttprequestfunction(task.url, function () {
callback();
});
}, N);
q.drain = function() {
console.log('all items have been processed');
}
for(var i = 0; i < 2000; i++){
q.push({url:"http://somewebsite.com/"+i+"/feed/"});
}
It keeps a window of ongoing actions, and a slot becomes available for a future task only once you invoke the callback function. The difference is that your current code opens 2000 connections immediately, so the failure rate is obviously high. Limiting the queue to a reasonable value such as 5, 10, or 20 (depending on the site and the connection) will give a better success rate. If a request fails, you can always try it again, or push the task to another async queue for another attempt; a sketch of that follows below. The key point is to invoke callback() in the queue function, so that a slot becomes available when the task is done.
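Here is that retry idea as a minimal sketch, reusing the hypothetical somehttprequestfunction from above (assumed here to report failure through an err argument) and capping each task at three attempts:
var q = async.queue(function (task, done) {
    somehttprequestfunction(task.url, function (err) {
        var tries = task.tries || 0;
        if (err && tries < 3) {
            task.tries = tries + 1;
            q.push(task); // re-queue the failed task for another attempt
        }
        done(); // always free the slot, success or failure
    });
}, 10);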
var q = async.queue(function (task, callback) {
crawl(task.url);
callback();
}, N);
You're executing the next task immediately after starting the previous one; this way, the queue is meaningless. You should modify your code like this:
// first, modify your 'crawl' function to take a callback argument, and call this callback after the job is done.
// then
var q = async.queue(function (task, next/* name this argument as 'next' is more meaningful */) {
crawl(task.url, function () {
// after this one is done, start next one.
next();
});
// or, more simply: crawl(task.url, next); a sketch of the modified crawl follows below.
}, N);
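For completeness, here is a minimal sketch of crawl modified that way, keeping getOptions from the question. It also buffers the whole body before parsing: the original parsed inside the 'data' handler, which can hand parseLocation a partial chunk and would explain some of the lost responses:
function crawl(i, done) {
    var req = http.request(getOptions(i), function (res) {
        var body = '';
        res.on('data', function (chunk) { body += chunk; });
        res.on('end', function () {
            parseLocation(body);
            done(); // signal the queue that this slot is free
        });
    });
    req.on('error', function (err) {
        console.error('crawl ' + i + ' failed:', err.message);
        done(); // free the slot even on failure
    });
    req.end();
}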
Another option, if you want: vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];
var myInterval = setInterval(function() {
incrementer++
if (incrementer == 100) {
clearInterval(myInterval);
// when done, parse the results array
return; // don't make another request on this final tick
}
//make request here
//push request result to array here
}, 500);
This invokes the function every half second: an easy way to force sequential behaviour and to stop after x requests.
I know I am a little late to the question, but here is a solution I wrote to slow down the rate of requests when testing an API endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/")
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;
fs.readFile('test.txt', function(err, data){
var parsedItems = JSON.parse(data);
var urlparts = [];
// `range` is not built into JavaScript, so define a small generator helper
function* range(start, end) {
for (var i = start; i < end; i++) yield i;
}
// create a queue
for (let year of range(1975, 2016)) {
for (var make in parsedItems[year]){
console.log(year, make, '/models/' + year + '/' + make)
urlparts.push({urlpart:'/models/' + year + '/' + make, year: year, make: make})
}
}
// start dequeue
waitDequeue();
// This function calls itself after the makeRequest promise completes
function waitDequeue(){
var item = urlparts.pop()
if (item){
makeRequest(item)
.then(function(){
// wait this time before next dequeue
setTimeout(function() {
waitDequeue();
}, 3000);
})
} else {
write(parsedItems)
}
}
// make a request, mutate parsedItems then resolve
function makeRequest(item){
return new Promise((resolve, reject)=>{
request
.get(item.urlpart)
.set(options.auth[0], options.auth[1])
.set(options.type[0], options.type[1])
.end(function(err, res) {
if (err) return reject(err); // `done1` was undefined; reject the promise instead
console.log(res.body)
res.body.forEach(function(model){
parsedItems[item.year][item.make][model] = {}
});
resolve()
})
})
}
// write the results back to the file
function write(parsedItems){
fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function(err){
console.log(err)
})
}
})
A little late, but I have found this works!
Using async you can slow down the queue by using whilst inside the task handler, e.g.:
var q = async.priorityQueue(function(task, callback) {
// your processing code for each task goes here
// when ready to complete the task, delay it by calling:
var count = 0;
async.whilst( // wait ~10 seconds (10 iterations of 1000 ms)
function() {
return count < 10;
},
function(callback) {
count++;
setTimeout(function() {
callback(null, count);
}, 1000);
},
function (err, n) {
// n seconds have passed
callback(); //callback to q handler
}
); //whilst
} , 5);