Grouping redis.get for 2ms and then executing by mget

Grouping redis.get for 2ms and then executing by mget - node.js

My application makes about 50 redis.get call to serve a single http request, it serves millions of request daily and application runs on about 30 pods.
When monitoring on newrelic i am getting 200MS average redis.get time, To Optimize this i wrote a simple pipeline system in nodejs which is simply a wrapper over redis.get and it pushes all the request in queue, and then execute the queue using redis.mget (getting all the keys in bulk).
Following is the code snippet:
class RedisBulk {
constructor() {
this.queue = [];
this.processingQueue = {};
this.intervalId = setInterval(() => {
this._processQueue();
}, 5);
}
clear() {
clearInterval(this.intervalId);
}
get(key, cb) {
this.queue.push({cb, key});
}
_processQueue() {
if (this.queue.length > 0) {
let queueLength = this.queue.length;
logger.debug('Processing Queue of length', queueLength);
let time = (new Date).getTime();
this.processingQueue[time] = this.queue;
this.queue = []; //empty the queue
let keys = [];
this.processingQueue[time].forEach((item)=> {
keys.push(item.key);
});
global.redisClient.mget(keys, (err, replies)=> {
if (err) {
captureException(err);
console.error(err);
} else {
this.processingQueue[time].forEach((item, index)=> {
item.cb(err, replies[index]);
});
}
delete this.processingQueue[time];
});
}
}
}
let redis_bulk = new RedisBulk();
redis_bulk.get('a');
redis_bulk.get('b');
redis_bulk.get('c');
redis_bulk.get('d');
My Question is: is this a good approach? will it help in optimizing redis get time? is there any other solution for above problem?
Thanks

I'm not a redis expert but judging by the documentation ;
MGET has the time complexity of
O(N) where N is the number of keys to retrieve.
And GET has the time complexity of
O(1)
Which brings both scenarios to the same end result in terms of time complexity in your scenario. Having a bulk request with MGET can bring you some improvements for the IO but apart from that looks like you have the same bottleneck.
I'd ideally split my data into chunks, responding via multiple http requests in async fashion if that's an option.
Alternatively, you can try calling GET with promise.all() to run GET requests in parallel, for all the GET calls you need.
Something like;
const asyncRedis = require("async-redis");
const client = asyncRedis.createClient();
function bulk() {
const keys = [];
return Promise.all(keys.map(client.get))
}

Related

Nodejs - Fire multiple API calls while limiting the rate and wait until they are all done

My issues
Launch 1000+ online API that limits the number of API calls to 10 calls/sec.
Wait for all the API calls to give back a result (or retry), it can take 5 sec before the API sends it data
Use the combined data in the rest of my app
What I have tried while looking at a lot of different questions and answers here on the site
Use promise to wait for one API request
const https = require("https");
function myRequest(param) {
const options = {
host: "api.xxx.io",
port: 443,
path: "/custom/path/"+param,
method: "GET"
}
return new Promise(function(resolve, reject) {
https.request(options, function(result) {
let str = "";
result.on('data', function(chunk) {str += chunk;});
result.on('end', function() {resolve(JSON.parse(str));});
result.on('error', function(err) {console.log("Error: ", err);});
}).end();
});
};
Use Promise.all to do all the requests and wait for them to finish
const params = [{item: "param0"}, ... , {item: "param1000+"}]; // imagine 1000+ items
const promises = [];
base.map(function(params){
promises.push(myRequest(params.item));
});
result = Promise.all(promises).then(function(data) {
// doing some funky stuff with dat
});
So far so good, sort of
It works when I limit the number of API requests to a maximum of 10 because then the rate limiter kicks in. When I console.log(promises), it gives back an array of 'request'.
I have tried to add setTimeout in different places, like:
...
base.map(function(params){
promises.push(setTimeout(function() {
myRequest(params.item);
}, 100));
});
...
But that does not seem to work. When I console.log(promises), it gives back an array of 'function'
My questions
Now I am stuck ... any ideas?
How do I build in retries when the API gives an error
Thank you for reading up to hear, you are already a hero in my book!

When you have a complicated control-flow using async/await helps a lot to clarify the logic of the flow.
Let's start with the following simple algorithm to limit everything to 10 requests per second:
make 10 requests
wait 1 second
repeat until no more requests
For this the following simple implementation will work:
async function rateLimitedRequests (params) {
let results = [];
while (params.length > 0) {
let batch = [];
for (i=0; i<10; i++) {
let thisParam = params.pop();
if (thisParam) { // use shift instead
batch.push(myRequest(thisParam.item)); // of pop if you want
} // to process in the
// original order.
}
results = results.concat(await Promise.all(batch));
await delayOneSecond();
}
return results;
}
Now we just need to implement the one second delay. We can simply promisify setTimeout for this:
function delayOneSecond() {
return new Promise(ok => setTimeout(ok, 1000));
}
This will definitely give you a rate limiter of just 10 requests each second. In fact it performs somewhat slower than that because each batch will execute in request time + one second. This is perfectly fine and already meet your original intent but we can improve this to squeeze a few more requests to get as close as possible to exactly 10 requests per second.
We can try the following algorithm:
remember the start time
make 10 requests
compare end time with start time
delay one second minus request time
repeat until no more requests
Again, we can use almost exactly the same logic as the simple code above but just tweak it to do time calculations:
const ONE_SECOND = 1000;
async function rateLimitedRequests (params) {
let results = [];
while (params.length > 0) {
let batch = [];
let startTime = Date.now();
for (i=0; i<10; i++) {
let thisParam = params.pop();
if (thisParam) {
batch.push(myRequest(thisParam.item));
}
}
results = results.concat(await Promise.all(batch));
let endTime = Date.now();
let requestTime = endTime - startTime;
let delayTime = ONE_SECOND - requestTime;
if (delayTime > 0) {
await delay(delayTime);
}
}
return results;
}
Now instead of hardcoding the one second delay function we can write one that accept a delay period:
function delay(milliseconds) {
return new Promise(ok => setTimeout(ok, milliseconds));
}
We have here a simple, easy to understand function that will rate limit as close as possible to 10 requests per second. It is rather bursty in that it makes 10 parallel requests at the beginning of each one second period but it works. We can of course keep implementing more complicated algorithms to smooth out the request pattern etc. but I leave that to your creativity and as homework for the reader.

How to run asynchronous tasks synchronous?

I'm developing an app with the following node.js stack: Express/Socket.IO + React. In React I have DataTables, wherein you can search and with every keystroke the data gets dynamically updated! :)
I use Socket.IO for data-fetching, so on every keystroke the client socket emits some parameters and the server calls then the callback to return data. This works like a charm, but it is not garanteed that the returned data comes back in the same order as the client sent it.
To simulate: So when I type in 'a', the server responds with this same 'a' and so for every character.
I found the async module for node.js and tried to use the queue to return tasks in the same order it received it. For simplicity I delayed the second incoming task with setTimeout to simulate a slow performing database-query:
Declaration:
const async = require('async');
var queue = async.queue(function(task, callback) {
if(task.count == 1) {
setTimeout(function() {
callback();
}, 3000);
} else {
callback();
}
}, 10);
Usage:
socket.on('result', function(data, fn) {
var filter = data.filter;
if(filter.length === 1) { // TEST SYNCHRONOUSLY
queue.push({name: filter, count: 1}, function(err) {
fn(filter);
// console.log('finished processing slow');
});
} else {
// add some items to the queue
queue.push({name: filter, count: filter.length}, function(err) {
fn(data.filter);
// console.log('finished processing fast');
});
}
});
But the way I receive it in the client console, when I search for abc is as follows:
ab -> abc -> a(after 3 sec)
I want it to return it like this: a(after 3sec) -> ab -> abc
My thought is that the queue runs the setTimeout and then goes further and eventually the setTimeout gets fired somewhere on the event loop later on. This resulting in returning later search filters earlier then the slow performing one.
How can i solve this problem?

First a few comments, which might help clear up your understanding of async calls:
Using "timeout" to try and align async calls is a bad idea, that is not the idea about async calls. You will never know how long an async call will take, so you can never set the appropriate timeout.
I believe you are misunderstanding the usage of queue from async library you described. The documentation for the queue can be found here.
Copy pasting the documentation in here, in-case things are changed or down:
Creates a queue object with the specified concurrency. Tasks added to the queue are processed in parallel (up to the concurrency limit). If all workers are in progress, the task is queued until one becomes available. Once a worker completes a task, that task's callback is called.
The above means that the queue can simply be used to priorities the async task a given worker can perform. The different async tasks can still be finished at different times.
Potential solutions
There are a few solutions to your problem, depending on your requirements.
You can only send one async call at a time and wait for the first one to finish before sending the next one
You store the results and only display the results to the user when all calls have finished
You disregard all calls except for the latest async call
In your case I would pick solution 3 as your are searching for something. Why would you use care about the results for "a" if they are already searching for "abc" before they get the response for "a"?
This can be done by giving each request a timestamp and then sort based on the timestamp taking the latest.

SOLUTION:
Server:
exports = module.exports = function(io){
io.sockets.on('connection', function (socket) {
socket.on('result', function(data, fn) {
var filter = data.filter;
var counter = data.counter;
if(filter.length === 1 || filter.length === 5) { // TEST SYNCHRONOUSLY
setTimeout(function() {
fn({ filter: filter, counter: counter}); // return to client
}, 3000);
} else {
fn({ filter: filter, counter: counter}); // return to client
}
});
});
}
Client:
export class FilterableDataTable extends Component {
constructor(props) {
super();
this.state = {
endpoint: "http://localhost:3001",
filters: {},
counter: 0
};
this.onLazyLoad = this.onLazyLoad.bind(this);
}
onLazyLoad(event) {
var offset = event.first;
if(offset === null) {
offset = 0;
}
var filter = ''; // filter is the search character
if(event.filters.result2 != undefined) {
filter = event.filters.result2.value;
}
var returnedData = null;
this.state.counter++;
this.socket.emit('result', {
offset: offset,
limit: 20,
filter: filter,
counter: this.state.counter
}, function(data) {
returnedData = data;
console.log(returnedData);
if(returnedData.counter === this.state.counter) {
console.log('DATA: ' + JSON.stringify(returnedData));
}
}
This however does send unneeded data to the client, which in return ignores it. Somebody any idea's for further optimizing this kind of communication? For example a method to keep old data at the server and only send the latest?

Any way to reduce the amount of concurrent requests to fetch data and cache in nodejs?

I have an Express app which requires very low response rate ~<200ms. Right now we can only get this number but that's a separate topic.
We're planning to fetch a piece of data from the database, if found in Redis return the data if not then fire the request and save that to redis so the next requests can get it from Redis.
I'm running some testing and was wondering if there's a way to reduce the amount of database fetching requests?
For example, currently our application has 300req/s per box. We have six boxes running on AWS. If for the first time that piece of data is not available in Redis, there might be around ~500 requests trying to fetch the data from DB and cache that in Redis. We're trying to reduce that number down. Not sure if there's a way in Node.js or Redis to handle that.
Here's the code that I'm testing.
client.getAsync('key').then(function (data) {
if(data) {
console.log(data); // Return this data if found
res.send(data);
} else {
// I'm trying to reduce the number of calls for concurrent requests in this block.
console.log('not found');
var dataFromDb = // fetch data from DB
client.set('key', dataFromDb); // Fire and forget
res.send('not found'); // Return not found right away
}
});
And I test the call by using ab
ab -n 20 -c 10 http://localhost:8081/redis
This is the results I got
not found
not found
not found
not found
not found
not found
something
not found
something
something
something
something
something
something
something
something
something
something
In this example, there's 7 requests trying to fetch database with the same data and save to Redis.
My question is, is there anyway I can reduce the number of requests down? Because fetching DB is quite slow as of now ~900ms (We're trying to optimize that)

Yes there is. I did same thing. I will describe only logic here. Method to fetchCache should return a promise. Also you keep array of { cacheKey, promise }. Each time you send a request - you add key to this array. When next time you need to fetch cache - you check array first and if key there grabbing this promise. Else calling fetchCache.
Here is my code. It works, but probably hard to read. Should give you a basic understanding.
class DictTranslatableRepo {
constructor(model) {
var self = this;
self.title = model + "s Repo";
self.model = models[model];
self.running = {};
self.curItems = {};
}
*start() {
var self = this;
var curItems = yield self.model.findAll();
_.forEach(curItems, function(row) {
self.curItems[row.key] = row.value;
});
};
*map(from) {
var self = this;
if (from == "") return "";
if (!_.isUndefined(self.curItems[from])) return self.curItems[from];
if (_.isUndefined(self.running[from])) {
self.running[from] = [];
return new Promise(function(resolve, reject) {
self.running[from].push(resolve);
self.job(from, function(err, to) { // Main job
var callbackArr = self.running[from];
delete self.running[from];
_.forEach(callbackArr, function(callback) {
callback(to);
});
});
});
} else {
return new Promise(function(resolve, reject) {
self.running[from].push(resolve);
});
}
};
job(from, callback) {
var self = this;
var to = "as shown";
co(function*() {
try {
to = yield translator.translate(from);
yield self.model.add({key: from, value: to});
self.curItems[from] = to;
callback(null, to);
} catch (err) {
callback(err);
//logger.error("Cant translate entity: " + from);
}
}).catch(function(err) {
// Unhandled Error
callback(new Error(err));
});
};
}
My map method is your fetchCache method.

node asynchronous with response

function getResultsForOneDev(devID, res) {
var Contribution = require('../db/Contribution.js').model;
var SurveyState = require('../db/SurveyState.js').model;
var SurveyAnswer = require('../db/SurveyAnswer.js').model;
var contributionList = {
"dev": [ {
"contribs" : [ {
"surveyStates" : [ {
"surveyAnswers" : [ { } ]
} ]
} ]
} ]
};
Contribution.find({dev:devID}).exec(function (error, contribs){
// console.log("contribs:"+contribs);
contributionList = contribs;
console.log("contribs length:"+contribs.length);
for (var i = 0 ; i<contribs.length ; i++) {
(function(oneContrib) {
//console.log('contribs ID '+oneContrib._id);
SurveyState.find({contrib:oneContrib._id}).exec(function (error, surveyStates){
// console.log("surveyStates:"+surveyStates);
oneContrib.surveyStates = surveyStates;
console.log("surveyStates length:"+surveyStates.length);
for (var j = 0 ; j<surveyStates.length ; j++) {
(function(oneSurveyState) {
SurveyAnswer.find({surveyState:oneSurveyState._id}).exec(function (error, surveyAnswers){
// console.log("surveyAnswers:"+surveyAnswers);
oneSurveyState.surveyAnswers = surveyAnswers;
console.log("surveyAnswers length:"+surveyAnswers.length);
});
})(surveyStates[j]);
}
});
})(contribs[i]);
};
});
res.jsonp(contributionList);
}
This program does not run as I want, res.jsonp return empty contributionList.
I already try with async (https://github.com/caolan/async). What is the good pratice to fill contributionList before sending a res.jsonp ?

.find() is asynchronous. It returns immediately, before the callback has populated values into contributionList.
Move your res.jsonp() to the end of the callback code where contributionList is populated rather than outside the callback.
Since you seem to have multiple find() inside loops and whatnot, and you cannot guarantee the order the callbacks will run, you can use async (as you mention) to create a workflow to insure they all finish, and then run a final callback (executed by async) to invoke res.jsonp().

Because your database queries are asynchronous (they finish sometime later) and the rest of your code does not wait for them, your two for loops will finish long before the actual async responses will. As such, you have to actually keep track (somehow) of when the last async response is done and thus all the data in now in the contributionList data structure so you can now send your response.
My preference would be to use promises for this and Promise.all() to trigger an action when an arbitrary number of asynchronous operations are complete, but I don't know the database interfaces you're using to know which ones are promisified, so here's a generic method that simply uses a manual counter to keep track of how many async operations are still in flight and when the counter gets to zero, you have all the data now and you can send the response.
The additions to this code are the lines of code that use the variable remaining.
function getResultsForOneDev(devID, res) {
var Contribution = require('../db/Contribution.js').model;
var SurveyState = require('../db/SurveyState.js').model;
var SurveyAnswer = require('../db/SurveyAnswer.js').model;
var contributionList = {
"dev": [ {
"contribs" : [ {
"surveyStates" : [ {
"surveyAnswers" : [ { } ]
} ]
} ]
} ]
};
Contribution.find({dev:devID}).exec(function (error, contribs){
// console.log("contribs:"+contribs);
contributionList = contribs;
console.log("contribs length:"+contribs.length);
// keep track of how many async responses are left to be processed
// in a variable at a higher scope
var remaining = 0;
for (var i = 0 ; i<contribs.length ; i++) {
(function(oneContrib) {
//console.log('contribs ID '+oneContrib._id);
SurveyState.find({contrib:oneContrib._id}).exec(function (error, surveyStates){
// console.log("surveyStates:"+surveyStates);
oneContrib.surveyStates = surveyStates;
console.log("surveyStates length:"+surveyStates.length);
// add how many more responses are pending
remaining += surveyStates.length;
for (var j = 0 ; j<surveyStates.length ; j++) {
(function(oneSurveyState) {
SurveyAnswer.find({surveyState:oneSurveyState._id}).exec(function (error, surveyAnswers){
// console.log("surveyAnswers:"+surveyAnswers);
oneSurveyState.surveyAnswers = surveyAnswers;
console.log("surveyAnswers length:"+surveyAnswers.length);
// mark one more processed and see if all remaining ones are done
--remaining;
if (remaining === 0) {
res.jsonp(contributionList);
}
});
})(surveyStates[j]);
}
});
})(contribs[i]);
};
});
}
P.S. You should realize that you are somewhat flooding your database with a whole bunch of requests all at once (all attempting to run in parallel) and then sometime later the database will actually finish all of them. Depending upon the structure of the database and its ability to handle this flood of requests efficiently or share load with other users also using the database, this is sometimes not a best practice. So, sometimes it is better to send some small number of requests at once (e.g. 3-5) and each time one completes, you launch the next waiting request. The async library can do that type of management for you or you can fairly simply build your own little queue of requests and each time one finishes, you send another.

node.js process out of memory in http.request loop

In my node.js server i cant figure out, why it runs out of memory. My node.js server makes a remote http request for each http request it receives, therefore i've tried to replicate the problem with the below sample script, that also runs out of memory.
This only happens if the iterations in the for loop are very high.
From my point of view, the problem is related to the fact that node.js is queueing the remote http requests. How to avoid this?
This is the sample script:
(function() {
var http, i, mypost, post_data;
http = require('http');
post_data = 'signature=XXX%7CPSFA%7Cxxxxx_value%7CMyclass%7CMysubclass%7CMxxxxx&schedule=schedule_name_6569&company=XXXX';
mypost = function(post_data, cb) {
var post_options, req;
post_options = {
host: 'myhost.com',
port: 8000,
path: '/set_xxxx',
method: 'POST',
headers: {
'Content-Length': post_data.length
}
};
req = http.request(post_options, function(res) {
var res_data;
res.setEncoding('utf-8');
res_data = '';
res.on('data', function(chunk) {
return res_data += chunk;
});
return res.on('end', function() {
return cb();
});
});
req.on('error', function(e) {
return console.debug('TM problem with request: ' + e.message);
});
req.write(post_data);
return req.end;
};
for (i = 1; i <= 1000000; i++) {
mypost(post_data, function() {});
}
}).call(this);
$ node -v
v0.4.9
$ node sample.js
FATAL ERROR: CALL_AND_RETRY_2 Allocation failed - process out of memory
Tks in advance
gulden PT

Constraining the flow of requests into the server
It's possible to prevent overload of the built-in Server and its HTTP/HTTPS variants by setting the maxConnections property on the instance. Setting this property will cause node to stop accept()ing connections and force the operating system to drop requests when the listen() backlog is full and the application is already handling maxConnections requests.
Throttling outgoing requests
Sometimes, it's necessary to throttle outgoing requests, as in the example script from the question.
Using node directly or using a generic pool
As the question demonstrates, unchecked use of the node network subsystem directly can result in out of memory errors. Something like node-pool makes the active pool management attractive, but it doesn't solve the fundamental problem of unconstrained queuing. The reason for this is that node-pool doesn't provide any feedback about the state of the client pool.
UPDATE: As of v1.0.7 node-pool includes a patch inspired by this post to add a boolean return value to acquire(). The code in the following section is no longer necessary and the example with the streams pattern is working code with node-pool.
Cracking open the abstraction
As demonstrated by Andrey Sidorov, a solution can be reached by tracking the queue size explicitly and mingling the queuing code with the requesting code:
var useExplicitThrottling = function () {
var active = 0
var remaining = 10
var queueRequests = function () {
while(active < 2 && --remaining >= 0) {
active++;
pool.acquire(function (err, client) {
if (err) {
console.log("Error acquiring from pool")
if (--active < 2) queueRequests()
return
}
console.log("Handling request with client " + client)
setTimeout(function () {
pool.release(client)
if(--active < 2) {
queueRequests()
}
}, 1000)
})
}
}
queueRequests(10)
console.log("Finished!")
}
Borrowing the streams pattern
The streams pattern is a solution which is idiomatic in node. Streams have a write operation which returns false when the stream cannot buffer more data. The same pattern can be applied to a pool object with acquire() returning false when the maximum number of clients have been acquired. A drain event is emitted when the number of active clients drops below the maximum. The pool abstraction is closed again and it's possible to omit explicit references to the pool size.
var useStreams = function () {
var queueRequests = function (remaining) {
var full = false
pool.once('drain', function() {
if (remaining) queueRequests(remaining)
})
while(!full && --remaining >= 0) {
console.log("Sending request...")
full = !pool.acquire(function (err, client) {
if (err) {
console.log("Error acquiring from pool")
return
}
console.log("Handling request with client " + client)
setTimeout(pool.release, 1000, client)
})
}
}
queueRequests(10)
console.log("Finished!")
}
Fibers
An alternative solution can be obtained by providing a blocking abstraction on top of the queue. The fibers module exposes coroutines that are implemented in C++. By using fibers, it's possible to block an execution context without blocking the node event loop. While I find this approach to be quite elegant, it is often overlooked in the node community because of a curious aversion to all things synchronous-looking. Notice that, excluding the callcc utility, the actual loop logic is wonderfully concise.
/* This is the call-with-current-continuation found in Scheme and other
* Lisps. It captures the current call context and passes a callback to
* resume it as an argument to the function. Here, I've modified it to fit
* JavaScript and node.js paradigms by making it a method on Function
* objects and using function (err, result) style callbacks.
*/
Function.prototype.callcc = function(context /* args... */) {
var that = this,
caller = Fiber.current,
fiber = Fiber(function () {
that.apply(context, Array.prototype.slice.call(arguments, 1).concat(
function (err, result) {
if (err)
caller.throwInto(err)
else
caller.run(result)
}
))
})
process.nextTick(fiber.run.bind(fiber))
return Fiber.yield()
}
var useFibers = function () {
var remaining = 10
while(--remaining >= 0) {
console.log("Sending request...")
try {
client = pool.acquire.callcc(this)
console.log("Handling request with client " + client);
setTimeout(pool.release, 1000, client)
} catch (x) {
console.log("Error acquiring from pool")
}
}
console.log("Finished!")
}
Conclusion
There are a number of correct ways to approach the problem. However, for library authors or applications that require a single pool to be shared in many contexts it is best to properly encapsulate the pool. Doing so helps prevent errors and produces cleaner, more modular code. Preventing unconstrained queuing then becomes an evented dance or a coroutine pattern. I hope this answer dispels a lot of FUD and confusion around blocking-style code and asynchronous behavior and encourages you to write code which makes you happy.

yes, you trying to queue 1000000 requests before even starting them. This version keeps limited number of request (100):
function do_1000000_req( cb )
{
num_active = 0;
num_finished = 0;
num_sheduled = 0;
function shedule()
{
while (num_active < 100 && num_sheduled < 1000000) {
num_active++;
num_sheduled++;
mypost(function() {
num_active--;
num_finished++;
if (num_finished == 1000000)
{
cb();
return;
} else if (num_sheduled < 1000000)
shedule();
});
}
}
}
do_1000000_req( function() {
console.log('done!');
});

the node-pool module can help you. For more détails, see this post (in french), http://blog.touv.fr/2011/08/http-request-loop-in-nodejs.html

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string