Node.js: How can I optimize writing many files?

I'm working in a Node environment on Windows. My code is receiving 30 Buffer objects (~500-900kb each) each second, and I need to save this data to the file system as quickly as possible, without engaging in any work that blocks the receipt of the following Buffer (i.e. the goal is to save the data from every buffer, for ~30-45 minutes). For what it's worth, the data is sequential depth frames from a Kinect sensor.
My question is: What is the most performant way to write files in Node?
Here's pseudocode:
let num = 0
async function writeFile(filename, data) {
    fs.writeFileSync(filename, data)
}
// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data){
    let filename = 'file-' + num++
    // Do anything with data here to optimize write?
    writeFile(filename, data)
})
fs.writeFileSync seems much faster than fs.writeFile, which is why I'm using that above. But are there any other ways to operate on the data or write to file that could speed up each save?

First off, you never want to use fs.writeFileSync() in handling real-time requests because that blocks the entire node.js event loop until the file write is done.
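For illustration, a minimal non-blocking sketch using the promise-based API that ships with the fs module (the saveFrame name and the error handling are just placeholders):

const fs = require('fs')

// Non-blocking: the event loop stays free while libuv's thread pool
// does the actual disk I/O in the background.
function saveFrame(filename, buffer) {
    return fs.promises.writeFile(filename, buffer).catch(err => {
        // log the failure without blocking or crashing the capture loop
        console.error('write failed for', filename, err)
    })
}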
OK, since you're writing each block of data to a different file, you want to allow multiple disk writes to be in flight at the same time, but not an unlimited number. So it's still appropriate to use a queue, but this time the queue doesn't have just one write in progress at a time, it has some number of writes in progress at the same time:
const fs = require('fs');
const EventEmitter = require('events');

class Queue extends EventEmitter {
    constructor(basePath, baseIndex, concurrent = 5) {
        super();
        this.basePath = basePath;
        this.q = [];
        this.paused = false;
        this.inFlightCntr = 0;
        this.fileCntr = baseIndex;
        this.maxConcurrent = concurrent;
    }

    // add item to the queue and start writing (if not already at max concurrency)
    add(data) {
        this.q.push(data);
        this.write();
    }

    // start as many writes from the queue as the concurrency limit allows
    write() {
        while (!this.paused && this.q.length && this.inFlightCntr < this.maxConcurrent) {
            this.inFlightCntr++;
            let buf = this.q.shift();
            try {
                fs.writeFile(this.basePath + this.fileCntr++, buf, err => {
                    this.inFlightCntr--;
                    if (err) {
                        this.err(err);
                    } else {
                        // write more data
                        this.write();
                    }
                });
            } catch(e) {
                this.err(e);
            }
        }
    }

    err(e) {
        this.pause();
        this.emit('error', e);
    }

    pause() {
        this.paused = true;
    }

    resume() {
        this.paused = false;
        this.write();
    }
}
let q = new Queue("file-", 0, 5);
// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data){
    q.add(data);
});
q.on('error', function(e) {
    // got some sort of write error here
    console.log(e);
});
Things to consider:
Experiment with the concurrent value you pass to the Queue constructor. Start with a value of 5, then see if raising it gives you better or worse performance. The node.js file I/O subsystem uses a thread pool to implement asynchronous disk writes, so there is a maximum number of writes it will actually run concurrently; cranking the concurrent number up really high probably does not make things go faster.
You can experiment with increasing the size of the disk I/O thread pool by setting the UV_THREADPOOL_SIZE environment variable before you start your node.js app (see the sketch after this list).
Your biggest friend here is disk write speed. So, make sure you have a fast disk with a good disk controller. A fast SSD on a fast bus would be best.
If you can spread the writes out across multiple actual physical disks, that will likely also increase write throughput (more disk heads at work).
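As a minimal sketch of the UV_THREADPOOL_SIZE suggestion above (the value 16 is only an example; libuv sizes its pool lazily, so the variable has to be set before the first asynchronous fs/dns/crypto call):

// Option 1 (shell, Windows): set UV_THREADPOOL_SIZE=16 && node app.js
// Option 2 (in code): assign it at the very top of the entry script, before
// anything touches the thread pool. This relies on libuv reading the variable
// lazily on first use, so treat it as an assumption and verify it in testing.
process.env.UV_THREADPOOL_SIZE = 16

const fs = require('fs')
// ...rest of the app; async fs.writeFile() calls can now use up to 16
// pool threads instead of the default 4.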
This is a prior answer based on the initial interpretation of the question (before editing that changed it).
Since it appears you need to do your disk writes in order (all to the same file), then I'd suggest that you either use a write stream and let the stream object serialize and buffer the data for you, or you can create a queue yourself like this:
const fs = require('fs');
const EventEmitter = require('events');

class Queue extends EventEmitter {
    // takes an already opened file descriptor (e.g. from fs.open / fs.openSync)
    constructor(fileHandle) {
        super();
        this.f = fileHandle;
        this.q = [];
        this.nowWriting = false;
        this.paused = false;
    }

    // add item to the queue and write (if not already writing)
    add(data) {
        this.q.push(data);
        this.write();
    }

    // write next block from the queue (if not already writing)
    write() {
        if (!this.nowWriting && !this.paused && this.q.length) {
            this.nowWriting = true;
            let buf = this.q.shift();
            fs.write(this.f, buf, (err, bytesWritten) => {
                this.nowWriting = false;
                if (err) {
                    this.pause();
                    this.emit('error', err);
                } else {
                    // write next block
                    this.write();
                }
            });
        }
    }

    pause() {
        this.paused = true;
    }

    resume() {
        this.paused = false;
        this.write();
    }
}
// pass an already opened file descriptor (e.g. from fs.openSync)
let q = new Queue(fileHandle);
// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data){
    q.add(data);
});
q.on('error', function(err) {
    // got disk write error here
});
You could use a writeStream instead of this custom Queue class, but the problem with that is that the writeStream may fill up and then you'd have to have a separate buffer as a place to put the data anyway. Using your own custom queue like above takes care of both issues at once.
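For comparison, here is a minimal sketch of the write-stream variant described above, with the extra application-level buffer you end up needing once the stream fills up (the filename is illustrative; the false return from ws.write() and the 'drain' event are standard stream behavior):

const fs = require('fs')

const ws = fs.createWriteStream('depth-frames.bin')
const backlog = []              // overflow buffer used only while the stream is full
let waitingForDrain = false

function writeChunk(buf) {
    if (waitingForDrain) {
        backlog.push(buf)       // stream is saturated; hold the data ourselves
    } else if (!ws.write(buf)) {
        waitingForDrain = true  // internal buffer passed its highWaterMark
        ws.once('drain', () => {
            waitingForDrain = false
            while (backlog.length && !waitingForDrain) {
                writeChunk(backlog.shift())
            }
        })
    }
}

// dataSender.on('gotData', writeChunk)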
Other Scalability/Performance Comments
Because you appear to be writing the data serially to the same file, your disk writing won't benefit from clustering or running multiple operations in parallel because they basically have to be serialized.
If your node.js server has other things to do besides just doing these writes, there might be a slight advantage (would have to be verified with testing) to creating a second node.js process and doing all the disk writing in that other process. Your main node.js process would receive the data and then pass it to the child process that would maintain the queue and do the writing.
Another thing you could experiment with is coalescing writes. When you have more than one item in the queue, you could combine them together into a single write. If the writes are already sizable, this probably doesn't make much difference, but if the writes were small this could make a big difference (combining lots of small disk writes into one larger write is usually more efficient).
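A minimal sketch of that coalescing idea, assuming a queue of Buffer objects like the serial Queue class above (Buffer.concat merges everything currently queued into one larger write):

// Drop-in variant of the Queue.write() method shown earlier:
write() {
    if (!this.nowWriting && !this.paused && this.q.length) {
        this.nowWriting = true;
        // combine all queued chunks into a single larger disk write
        let buf = this.q.length === 1 ? this.q.shift() : Buffer.concat(this.q.splice(0));
        fs.write(this.f, buf, (err) => {
            this.nowWriting = false;
            if (err) {
                this.pause();
                this.emit('error', err);
            } else {
                this.write();
            }
        });
    }
}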
Your biggest friend here is disk write speed. So, make sure you have a fast disk with a good disk controller. A fast SSD would be best.

I have written a service that does this extensively and the best thing you can do is to pipe the input data directly to the file (if you have an input stream as well).
A simple example where you download a file in such a way:
const http = require('http')
const fs = require('fs')

const ostream = fs.createWriteStream('./output')

http.get('http://nodejs.org/dist/index.json', (res) => {
    res.pipe(ostream)
})
.on('error', (e) => {
    console.error(`Got error: ${e.message}`);
})
So in this example there is no intermediate copy of the whole file involved. As the file is read in chunks from the remote http server, it is written to the file on disk. This is much more efficient than downloading the whole file from the server, holding it in memory and then writing it to a file on disk.
Streams are the basis of many operations in Node.js, so you should study them as well.
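As a minimal sketch of the same piping idea using stream.pipeline (in Node core since v10), which also propagates errors and cleans up both streams for you:

const fs = require('fs')
const http = require('http')
const { pipeline } = require('stream')

http.get('http://nodejs.org/dist/index.json', (res) => {
    pipeline(res, fs.createWriteStream('./output'), (err) => {
        if (err) {
            console.error(`Got error: ${err.message}`)
        }
    })
})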
One other thing you should investigate, depending on your scenario, is UV_THREADPOOL_SIZE, as I/O operations use libuv's thread pool, which defaults to 4 threads, and you might fill that up if you do a lot of writing.

Related

How can I limit the size of WriteStream buffer in NodeJS?

I'm using a WriteStream in NodeJS to write several GB of data, and I've identified the write loop as eating up ~2GB of virtual memory during runtime (which is then GC'd about 30 seconds after the loop finishes). I'm wondering how I can limit the size of the buffer node is using when writing the stream so that Node doesn't use up so much memory during that part of the code.
I've reduced it to this trivial loop:
let ofd = fs.openSync(fn, 'w')
let ws = fs.createWriteStream('', { fd: ofd })
// ...
while (...) { /* ...write ~4GB of binary formatted 32bit floats and uint32s via ws.write()... */ }
// ...
ws.end()
The stream.write function returns a boolean value that indicates whether the internal buffer is full. The buffer size is controlled by the highWaterMark option. However, this option is a threshold rather than a hard limit, which means you can still call stream.write even when the internal buffer is full, and memory usage will keep growing if you write code like this:
while (foo) {
    ws.write(bar);
}
In order to solve this issue, you have to handle a false return value from ws.write and wait until the stream's drain event is emitted, as in the following example.
async function write() {
    while (foo) {
        if (!ws.write(bar)) {
            await new Promise(resolve => ws.once('drain', resolve));
        }
    }
}
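If you also want to tune the threshold itself, the buffer size discussed above can be set when the stream is created; a minimal sketch (1 MB is just an example value, the writable-stream default is 16 KB):

const fs = require('fs')

// A larger highWaterMark means fewer drain round-trips when writing big binary
// chunks, at the cost of more memory buffered inside the stream.
const ws = fs.createWriteStream('output.bin', { highWaterMark: 1024 * 1024 })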

async.queue concurrent tasks

I am using async.queue to ensure that certain file copies in a service happen at most n concurrently, but watching the files copy, I sometimes see a lot more than the queue allows. Does anyone see something I may have missed in the implementation below?
createQueue(limit: number) {
    let self = this;
    return async.queue(function(cmdObj, callback) {
        console.log("Beginning copy");
        let cmd = cmdObj.cmd;
        let args = cmdObj.args;
        let request = cmdObj.req;
        request.state = State.IN_PROGRESS;
        self.reportStatus(request.destination);
        const proc = spawn(cmd, args); // uses an rsync command upstream
        proc.on("close", code => {
            if (code !== 0) {
                request.state = State.ERRORED;
                self.reportStatus(request.destination); // these just report to the caller
                statusMap.delete(request.destination);
            } else {
                fs.rename(request.destination + ".part", request.destination, err => {
                    if (err) console.error("RENAME ERR: " + err);
                });
                request.state = State.COMPLETED;
                self.reportStatus(request.destination); // same here
                statusMap.delete(request.destination);
            }
            callback();
        });
        proc.on("error", err => {
            console.error("COPY ERR: " + err);
        });
    }, limit); // limit here, for example, may be two, but I see four copies concurrently
}
EDIT:
I now believe this is a side effect of the rest of the system...queues being cleared and reinitialized AFTER copies have started...so when new items are added to the reinitialized queues, they kick off immediately, as the system has no idea if something has been handed off to userland and is currently running.
So, this was user error...PEBCAK! Posting the solution more as a cautionary tale:
The queues above were working as designed, but I had an endpoint for the calling server to clear the queues as necessary; the problem was I was using kill() and re-initializing the queues, losing all track of any jobs in progress and their callbacks. As soon as a new item hit the fresh queue, it would think nothing was happening and spawn a new copy process. I resolved it by using remove() to clear the queues instead of re-initializing them.
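For reference, a minimal sketch of that fix, assuming the async library's queue API (queue.remove() takes a test function over the queued tasks and only drops work that has not started yet, so in-flight copies and their callbacks are preserved; copyWorker stands in for the worker function shown above):

const async = require('async')

const q = async.queue(copyWorker, 2)    // copyWorker: the worker shown above; limit of 2

// Instead of q.kill() followed by recreating the queue (which forgets in-flight jobs),
// drop only the tasks still waiting to start:
function clearPending() {
    q.remove(() => true)                // match (and remove) every not-yet-started task
}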

How do I apply back-pressure to node streams?

While attempting to experiment with Node.JS streams I ran into an interesting conundrum. When the input (Readable) stream pushes more data than the destination (Writable) cares about, I was unable to apply back-pressure correctly.
The two methods I attempted were to return false from Writable.prototype._write and to retain a reference to the Readable so I can call Readable.pause() from the Writable. Neither solution helped much, as I'll explain.
In my exercise (which you can view the full source as a Gist) I have three streams:
Readable - PasscodeGenerator
util.inherits(PasscodeGenerator, stream.Readable);
function PasscodeGenerator(prefix) {
    stream.Readable.call(this, {objectMode: true});
    this.count = 0;
    this.prefix = prefix || '';
}
PasscodeGenerator.prototype._read = function() {
    var passcode = '' + this.prefix + this.count;
    if (!this.push({passcode: passcode})) {
        this.pause();
        this.once('drain', this.resume.bind(this));
    }
    this.count++;
};
I thought that the return value from this.push() was enough to self-pause and wait for the drain event to resume.
Transform - Hasher
util.inherits(Hasher, stream.Transform);
function Hasher(hashType) {
    stream.Transform.call(this, {objectMode: true});
    this.hashType = hashType;
}
Hasher.prototype._transform = function(sample, encoding, next) {
    var hash = crypto.createHash(this.hashType);
    hash.setEncoding('hex');
    hash.write(sample.passcode);
    hash.end();
    sample.hash = hash.read();
    this.push(sample);
    next();
};
Simply add the hash of the passcode to the object.
Writable - SampleConsumer
util.inherits(SampleConsumer, stream.Writable);
function SampleConsumer(max) {
    stream.Writable.call(this, {objectMode: true});
    this.max = (max != null) ? max : 10;
    this.count = 0;
}
SampleConsumer.prototype._write = function(sample, encoding, next) {
    this.count++;
    console.log('Hash %d (%s): %s', this.count, sample.passcode, sample.hash);
    if (this.count < this.max) {
        next();
    } else {
        return false;
    }
};
Here I want to consume the data as fast as possible until I reach my max number of samples and then end the stream. I tried using this.end() instead of return false, but that caused the dreaded 'write after end' problem. Returning false does stop everything if the sample size is small, but when it is large I get an out of memory error:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory
Aborted (core dumped)
According to this SO answer, in theory the Writable stream would return false, causing the streams to buffer until the buffers were full (16 by default for objectMode) and eventually the Readable would call its this.pause() method. But 16 + 16 + 16 = 48; that's 48 objects in buffers before things fill up and the system clogs. Actually less, because there is no cloning involved, so the objects passed between them are the same references. Would that not mean only 16 objects in memory until the high water mark halts everything?
Lastly I realize I could have the Writable reference the Readable to call its pause method using closures. However, this solution means the Writable stream knows too much about another object. I'd have to pass in a reference:
var foo = new PasscodeGenerator('foobar');
foo
    .pipe(new Hasher('md5'))
    .pipe(new SampleConsumer(samples, foo));
And this feels out of norm for how streams would work. I thought back-pressure was enough to cause a Writable to stop a Readable from pushing data and prevent out of memory errors.
An analogous example would be the Unix head command. Implementing that in Node, I would assume the destination could end rather than simply ignore the extra data, since otherwise the source keeps pushing data even though the destination already has enough to satisfy the beginning portion of the file.
How do I idiomatically construct custom streams such that when the destination is ready to end the source stream doesn't attempt to push more data?
This is a known issue with how _read() is called internally. Since your _read() is always pushing synchronously/immediately, the internal stream implementation can get into a loop in the right conditions. _read() implementations are generally expected to do some sort of async I/O (e.g. reading from disk or network).
The workaround for this (as noted in the link above) is to make your _read() asynchronous at least some of the time. You could also just make it async every time it's called with:
PasscodeGenerator.prototype._read = function(n) {
    var passcode = '' + this.prefix + this.count;
    var self = this;
    // `setImmediate()` delays the push until the beginning
    // of the next tick of the event loop
    setImmediate(function() {
        self.push({passcode: passcode});
    });
    this.count++;
};

Node.js Synchronous Library Code Blocking Async Execution

Suppose you've got a 3rd-party library that's got a synchronous API. Naturally, attempting to use it in an async fashion yields undesirable results in the sense that you get blocked when trying to do multiple things in "parallel".
Are there any common patterns that allow us to use such libraries in an async fashion?
Consider the following example (using the async library from NPM for brevity):
var async = require('async');

function ts() {
    return new Date().getTime();
}

var startTs = ts();

process.on('exit', function() {
    console.log('Total Time: ~' + (ts() - startTs) + ' ms');
});

// This is a dummy function that simulates some 3rd-party synchronous code.
function vendorSyncCode() {
    var future = ts() + 50; // ~50 ms in the future.
    while(ts() <= future) {} // Spin to simulate blocking work.
}

// My code that handles the workload and uses `vendorSyncCode`.
function myTaskRunner(task, callback) {
    // Do async stuff with `task`...
    vendorSyncCode(task);
    // Do more async stuff...
    callback();
}

// Dummy workload.
var work = (function() {
    var result = [];
    for(var i = 0; i < 100; ++i) result.push(i);
    return result;
})();

// Problem:
// -------
// The following two calls will take roughly the same amount of time to complete.
// In this case, ~6 seconds each.
async.each(work, myTaskRunner, function(err) {});
async.eachLimit(work, 10, myTaskRunner, function(err) {});

// Desired:
// --------
// The latter call with 10 "workers" should complete roughly an order of magnitude
// faster than the former.
Are fork/join or spawning worker processes manually my only options?
Yes, those are your only options.
If you need to use 50ms of cpu time to do something, and need to do it 10 times, then you'll need 500ms of cpu time to do it. If you want it to be done in less than 500ms of wall clock time, you need to use more cpus. That means multiple node instances (or a C++ addon that pushes the work out onto the thread pool). How to get multiple instances depends on your app structure: a child process that you feed the work to using child_process.send() is one way, running multiple servers with cluster is another, and breaking up your server is yet another. Say it's an image store application that is mostly fast at processing requests, unless someone asks to convert an image into another format, which is cpu intensive. You could push the image processing portion into a different app and access it through a REST API, leaving the main app server responsive.
If you aren't concerned that it takes 50ms of cpu to do the request, but instead you are concerned that you can't interleave handling of other requests with the processing of the cpu intensive request, then you could break the work up into small chunks, and schedule the next chunk with setInterval(). That's usually a horrid hack, though. Better to restructure the app.
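A minimal sketch of the child-process approach described above, under the assumption that the blocking vendor call can be moved into a separate script (worker.js is a hypothetical filename); the parent's event loop stays responsive while the children burn CPU:

// worker.js (hypothetical): runs the synchronous vendor code off the main process
// process.on('message', function(task) {
//     vendorSyncCode(task)             // the blocking, CPU-bound call
//     process.send({ done: task })
// })

// main process: fork a few workers and round-robin the CPU-heavy tasks to them
const { fork } = require('child_process')

const workers = [fork('./worker.js'), fork('./worker.js')]
let next = 0

function runTask(task, callback) {
    const w = workers[next]
    next = (next + 1) % workers.length
    w.once('message', () => callback()) // the child replies when the task is done
    w.send(task)
}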

Reading file in segments of X number of lines

I have a file with a lot of entries (10+ million), each representing a partial document that is being saved to a mongo database (based on some criteria, non-trivial).
To avoid overloading the database (which is doing other operations at the same time), I wish to read in chunks of X lines, wait for them to finish, read the next X lines, etc.
Is there any way to use any of the fs callback mechanisms to also "halt" progress at a certain point, without blocking the entire program? From what I can tell they will all run from start to finish with no way of stopping it, unless you stop reading the file entirely.
The issue is that because of the file size, memory also becomes a problem, and because of the time the updates take, a LOT of the data will be held in memory, exceeding the 1 GB limit and causing the program to crash. Secondarily, as I said, I don't want to queue 1 million updates and completely stress the mongo database.
Any and all suggestions welcome.
UPDATE: Final solution using line-reader (available via npm) below, in pseudo-code.
var lineReader = require('line-reader');
var filename = <wherever you get it from>;
lineReader.eachLine(filename, function(line, last, cb) {
    //
    // Do work here, line contains the line data
    // last is true if it's the last line in the file
    //
    function checkProcessed(callback) {
        if (doneProcessing()) { // Implement doneProcessing to check whether whatever you are doing is done
            callback();
        } else {
            setTimeout(function() { checkProcessed(callback) }, 100); // Adjust timeout according to expected time to process one line
        }
    }
    checkProcessed(cb);
});
This is implemented to make sure doneProcessing() returns true before attempting to work on more lines - this means you can effectively throttle whatever you are doing.
I don't use MongoDB and I'm not an expert in using Lazy, but I think something like below might work or give you some ideas. (note that I have not tested this code)
var fs = require('fs'),
    lazy = require('lazy');

var readStream = fs.createReadStream('yourfile.txt');

var file = lazy(readStream)
    .lines                          // ask to read stream line by line
    .take(100)                      // and read 100 lines at a time.
    .join(function(oneHundredLines) {
        readStream.pause();         // pause reading the stream
        writeToMongoDB(oneHundredLines, function(err) {
            // error checking goes here
            // resume the stream 1 second after MongoDB finishes saving.
            setTimeout(readStream.resume, 1000);
        });
    });
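The answer above is untested by its author; as an alternative sketch of the same pause/process/resume idea using only Node's built-in readline module (the batch size of 100 and the insertBatch function are illustrative placeholders for the MongoDB write):

const fs = require('fs')
const readline = require('readline')

const rl = readline.createInterface({ input: fs.createReadStream('yourfile.txt') })

let batch = []

rl.on('line', (line) => {
    batch.push(line)
    if (batch.length >= 100) {          // illustrative chunk size
        rl.pause()                      // stop (most) further 'line' events
        const docs = batch
        batch = []
        insertBatch(docs, (err) => {    // insertBatch: your MongoDB write, hypothetical
            // check err as needed, then continue reading the file
            rl.resume()
        })
    }
})

rl.on('close', () => {
    if (batch.length) {
        insertBatch(batch, () => {})    // flush the final partial batch
    }
})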
