How do I apply back-pressure to node streams?

While experimenting with Node.js streams I ran into an interesting conundrum: when the input (Readable) stream pushes more data than the destination (Writable) cares about, I was unable to apply back-pressure correctly.
The two methods I attempted were to return false from Writable.prototype._write and to retain a reference to the Readable so I can call Readable.pause() from the Writable. Neither solution helped much, which I'll explain.
In my exercise (the full source of which you can view as a Gist) I have three streams:
Readable - PasscodeGenerator
util.inherits(PasscodeGenerator, stream.Readable);
function PasscodeGenerator(prefix) {
  stream.Readable.call(this, {objectMode: true});
  this.count = 0;
  this.prefix = prefix || '';
}
PasscodeGenerator.prototype._read = function() {
  var passcode = '' + this.prefix + this.count;
  if (!this.push({passcode: passcode})) {
    this.pause();
    this.once('drain', this.resume.bind(this));
  }
  this.count++;
};
I thought that the return value from this.push() was enough to self-pause and wait for the drain event to resume.
Transform - Hasher
util.inherits(Hasher, stream.Transform);
function Hasher(hashType) {
  stream.Transform.call(this, {objectMode: true});
  this.hashType = hashType;
}
Hasher.prototype._transform = function(sample, encoding, next) {
  var hash = crypto.createHash(this.hashType);
  hash.setEncoding('hex');
  hash.write(sample.passcode);
  hash.end();
  sample.hash = hash.read();
  this.push(sample);
  next();
};
Simply add the hash of the passcode to the object.
Writable - SampleConsumer
util.inherits(SampleConsumer, stream.Writable);
function SampleConsumer(max) {
  stream.Writable.call(this, {objectMode: true});
  this.max = (max != null) ? max : 10;
  this.count = 0;
}
SampleConsumer.prototype._write = function(sample, encoding, next) {
  this.count++;
  console.log('Hash %d (%s): %s', this.count, sample.passcode, sample.hash);
  if (this.count < this.max) {
    next();
  } else {
    return false;
  }
};
Here I want to consume the data as fast as possible until I reach my max number of samples and then end the stream. I tried using this.end() instead of return false, but that caused the dreaded "write called after end" problem. Returning false does stop everything if the sample size is small, but when it is large I get an out-of-memory error:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory
Aborted (core dumped)
According to this SO answer, in theory the Writable would return false, causing the streams to buffer until the buffers were full (16 objects by default in objectMode), and eventually the Readable would call its this.pause() method. But 16 + 16 + 16 = 48; that's 48 objects in buffers before things fill up and the system is clogged. Actually less, because there is no cloning involved, so the objects passed between them are the same references. Would that not mean only 16 objects in memory until the high water mark halts everything?
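(As an aside, purely for illustration and not from my Gist: each stream's buffer size can be tuned with the highWaterMark option, so a tiny self-contained sketch of that per-stream buffering looks like this.)
// Illustrative only: every stream in a pipe chain keeps its own objectMode
// buffer of up to `highWaterMark` items (16 by default), so several buffers
// have to fill before back-pressure reaches the source.
var stream = require('stream');
var util = require('util');

function TinySource() {
  stream.Readable.call(this, {objectMode: true, highWaterMark: 4});
  this.n = 0;
}
util.inherits(TinySource, stream.Readable);
TinySource.prototype._read = function() {
  this.push(this.n < 100 ? {n: this.n++} : null); // finite, so it terminates
};

function SlowSink() {
  stream.Writable.call(this, {objectMode: true, highWaterMark: 4});
}
util.inherits(SlowSink, stream.Writable);
SlowSink.prototype._write = function(obj, encoding, next) {
  setTimeout(next, 10); // slow consumer; the source fills its buffers and waits
};

new TinySource().pipe(new SlowSink());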
Lastly, I realize I could have the Writable reference the Readable to call its pause method using closures. However, this solution means the Writable stream knows too much about another object. I'd have to pass in a reference:
var foo = new PasscodeGenerator('foobar');
foo
  .pipe(new Hasher('md5'))
  .pipe(new SampleConsumer(samples, foo));
And this feels out of the norm for how streams work. I thought back-pressure was enough to cause a Writable to stop the Readable from pushing data and to prevent out-of-memory errors.
An analogous example would be the Unix head command. Implementing that in Node, I would assume the destination could end, and not just be ignored while the source keeps pushing data even though the destination already has enough to satisfy the beginning portion of the file.
How do I idiomatically construct custom streams such that when the destination is ready to end the source stream doesn't attempt to push more data?

This is a known issue with how _read() is called internally. Since your _read() is always pushing synchronously/immediately, the internal stream implementation can get into a loop in the right conditions. _read() implementations are generally expected to do some sort of async I/O (e.g. reading from disk or network).
The workaround for this (as noted in the link above) is to make your _read() asynchronous at least some of the time. You could also just make it async every time it's called with:
PasscodeGenerator.prototype._read = function(n) {
  var passcode = '' + this.prefix + this.count;
  var self = this;
  // `setImmediate()` delays the push until the beginning
  // of the next tick of the event loop
  setImmediate(function() {
    self.push({passcode: passcode});
  });
  this.count++;
};

Related

Node.js memory leak when reading and writing large files

I am currently trying to implement the SPIMI index construction method in Node and I have run into an issue.
The code is the following:
let fs = require("fs");
let path = require("path");

module.exports = {
  fileStream: function (dirPath, fileStream) {
    return buildFileStream(dirPath, fileStream);
  },
  buildSpimi: function (fileStream, outDir) {
    let invIndex = {};
    let sortedInvIndex = {};
    let fileNameCount = 1;
    let outputTXT = "";
    let entryCounter = 0;
    let resString = "";
    fileStream.forEach((filePath, fileIndex) => {
      let data = fs.readFileSync(filePath).toString('utf-8');
      data = data.toUpperCase().split(/[^a-zA-Z]/).filter(function (ch) { return ch.length != 0; });
      data.forEach(token => {
        //CHANGE THE SIZE IF NECESSARY (4e+?)
        if (entryCounter > 100000) {
          Object.keys(invIndex).sort().forEach((key) => {
            sortedInvIndex[key] = invIndex[key];
          });
          outputTXT = outDir + "block" + fileNameCount;
          for (let SItoken in sortedInvIndex) {
            resString += SItoken + "," + sortedInvIndex[SItoken].toString();
          }
          fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(err); });
          resString = "";
          entryCounter = 0;
          sortedInvIndex = {};
          invIndex = {};
          console.log(outputTXT + " - written;");
          fileNameCount++;
        }
        if (invIndex[token] == undefined) {
          invIndex[token] = [];
          entryCounter++;
        }
        if (!invIndex[token].includes(fileIndex)) {
          invIndex[token].push(fileIndex);
          entryCounter++;
        }
      });
    });
    Object.keys(invIndex).sort().forEach((key) => {
      sortedInvIndex[key] = invIndex[key];
    });
    outputTXT = outDir + "block" + fileNameCount;
    for (let SItoken in sortedInvIndex) {
      resString += SItoken + "," + sortedInvIndex[SItoken].toString();
    }
    fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(err); });
    console.log(outputTXT + " - written;");
  }
};

function buildFileStream(dirPath, fileStream) {
  fileStream = fileStream || [];
  fs.readdirSync(dirPath).forEach(function (file) {
    let filepath = path.join(dirPath, file);
    let stat = fs.statSync(filepath);
    if (stat.isDirectory()) {
      fileStream = buildFileStream(filepath, fileStream);
    } else {
      fileStream.push(filepath);
    }
  });
  return fileStream;
}
I am using the exported functions in a separate file:
let spimi = require("./spimi");
let outputDir = "/Users/me/Desktop/SPIMI_OUT/"
let inputDir = "/Users/me/Desktop/gutenberg/2/2";
let fileStream = [];
let result = spimi.fileStream(inputDir, fileStream);
console.table(result)
console.log("Finished building the filestream");
let t0 = new Date();
spimi.buildSpimi(result, outputDir);
let t1 = new Date();
console.log(t1 - t0);
While this code kind of works when trying it on relatively small volumes of data (I tested up to 1.5 GB), there is obviously a memory leak somewhere: when monitoring the RAM usage I can see it going up as far as 4-5 GB.
I spent quite a lot of time trying to figure out what might be the cause, but I still couldn't find the issue.
I would appreciate any hints on this!
Thanks!
Something to understand about the language and garbage collection in general is that this:
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(...)
creates three additional copies of your data: first an uppercase copy, then a split-array copy, then a filtered copy of the split array.
So, at this point, you have four copies of your data all in memory. All but the filtered array are now eligible for garbage collection when the GC gets a chance to run, but if this data was initially large, you're going to be using at least 3x-4x as much memory as the file size (depending upon how many array items are removed in your .filter() operation).
None of this is a leak, but it's a very big peak memory usage which can be a problem.
A more memory-efficient way to process large files is to process them as a stream (not read them all into memory at once). You read a small chunk (say 1024 bytes), process it, read another chunk, process it, being careful about chunk boundaries. If your file naturally has line boundaries, there are already pre-built solutions for processing it line by line. If not, you can create your own chunk-processing mechanism. We would have to see a sample of your data to make more specific chunk-processing suggestions.
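For example, if your input files are line-oriented, a sketch along these lines (using the built-in readline module; processFileByLine and onToken are just illustrative names, not a drop-in replacement for your function) keeps only one line of a file in memory at a time:
const fs = require('fs');
const readline = require('readline');

function processFileByLine(filePath, onToken) {
  return new Promise((resolve, reject) => {
    const input = fs.createReadStream(filePath); // streams the file in chunks
    input.on('error', reject);
    const rl = readline.createInterface({ input, crlfDelay: Infinity });
    rl.on('line', (line) => {
      // tokenize just this one line instead of the whole file at once
      line.toUpperCase().split(/[^A-Z]/).forEach((token) => {
        if (token.length !== 0) onToken(token);
      });
    });
    rl.on('close', resolve);
  });
}
You would then await this per file instead of calling fs.readFileSync(), feeding each token into whatever index structure you keep.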
As another point, if you end up with a lot of keys in invIndex, then this line of code starts to become inefficient and you're doing it in your loop:
Object.keys(invIndex).sort()
This takes your object and gets all the keys into a temporary array, which you use only for the purpose of updating sortedInvIndex, which is yet another copy of your data. So, right there alone, this set of code makes three copies of all your keys and two copies of all the values. And it does that every time through your loop. Again, lots of peak memory usage that the GC won't normally clean up until your function is done.
A redesign of the way you process this data could probably reduce the peak memory usage by a factor of 100x. For memory efficiency, you want only the initial data, the final data representation, and just a little more for temporary transformations to ever be in use at the same time. You don't want to EVER be processing all the data multiple times, because each time you do that it creates yet another entire copy of all the data and adds to the peak memory usage.
If you show what the data input looks like and what data structure you're trying to end up with, I could probably take a crack at a much more efficient implementation.
Mykhailo, adding on to what jfriend said, it's actually not a memory leak. It's working as intended.
Something to consider is that readFile buffers the entire file! This is what causes the huge memory bloat. A better alternative is to use fs.createReadStream(), which will only buffer the part of the file you're currently reading. Unfortunately, implementing that solution may require a full rewrite of your code, as it returns an fs.ReadStream, which won't behave the way you're currently handling files. Check out this link and read the bottom of the section to see what I'm referencing.

Memory efficient growing Nodejs Duplex/Transform stream

I am trying to add variables into a template at specific indices through streams.
The idea is that I have a readable stream in, and a list of variables that can be either a readable stream, a buffer, or a string of undetermined size. These variables can be inserted at a predefined list of indices. I have a few questions based on my assumptions and what I have tried so far.
My first attempt was to do it manually with readable streams. However, I couldn't const buffer = templateIn.read(size) (since the buffers were still empty) before the combined template was trying to read it. The solution for that problem is similar to how you'd use a transform stream, so that was the next step I took.
However, I have a problem with transform streams. My problem is that something like this pseudocode will pile up buffers in memory until done() is called.
public _transform(chunk: Buffer, encoding: string, done: (err?: Error, data?: any) => void): void {
  let index = 0;
  while (index < chunk.length) {
    if (index === this.variableIndex) { // the basic idea (the actual logic is a bit more complex)
      this.insertStreamHere(index);
      index++;
    } else {
      // continue reading stream normally
    }
  }
  done();
}
From: https://github.com/nodejs/node/blob/master/lib/_stream_transform.js
In a transform stream, the written data is placed in a buffer. When
_read(n) is called, it transforms the queued up data, calling the
buffered _write cb's as it consumes chunks. If consuming a single
written chunk would result in multiple output chunks, then the first
outputted bit calls the readcb, and subsequent chunks just go into
the read buffer, and will cause it to emit 'readable' if necessary.
This way, back-pressure is actually determined by the reading side,
since _read has to be called to start processing a new chunk. However,
a pathological inflate type of transform can cause excessive buffering
here. For example, imagine a stream where every byte of input is
interpreted as an integer from 0-255, and then results in that many
bytes of output. Writing the 4 bytes {ff,ff,ff,ff} would result in
1kb of data being output. In this case, you could write a very small
amount of input, and end up with a very large amount of output. In
such a pathological inflating mechanism, there'd be no way to tell
the system to stop doing the transform. A single 4MB write could
cause the system to run out of memory.
So, TL;DR: how do I insert (large) streams at a specific index without building up a huge back-pressure of buffers in memory? Any advice is appreciated.
After a lot of reading of the documentation and the source code, a lot of trial and error, and some testing, I have come up with a solution for my problem. I could just copy and paste my solution, but for the sake of completeness I will explain my findings here.
Handling the back-pressure with pipes consists of a few parts. We've got the Readable that writes data to the Writable. The Readable provides a callback to the Writable, with which the Writable can tell the Readable it is ready to receive a new chunk of data. The reading part is simpler: the Readable has an internal buffer, and Readable.push() adds data to that buffer. When the data is being read, it will come from this internal buffer. Next to that, we can use Readable.readableHighWaterMark and Readable.readableLength to make sure we don't push too much data at once.
Readable.readableHighWaterMark - Readable.readableLength
is the maximum amount of bytes we should push to this internal buffer.
So this means that, since we want to read from two Readable streams at the same time, we need two Writable streams to control the flow. To merge data we will need to buffer it ourselves, since there is (as far as I know) no internal buffer in the Writable stream. So a Duplex stream is the best option, because we want to handle buffering, writing and reading ourselves.
Writing
So let's get to the code now. To control the state of multiple streams we will create a state interface, which looks as follows:
declare type StreamCallback = (error?: Error | null) => void;

interface MergingState {
  callback: StreamCallback;
  queue: BufferList;
  highWaterMark: number;
  size: number;
  finalizing: boolean;
}
The callback holds the last callback provided by either write or final (we'll get to final later). highWaterMark indicates the maximum size of our queue, and size is the current size of the queue. Lastly, the finalizing flag indicates that the current queue is the last queue, so once the queue is empty we're done reading the stream belonging to that state.
BufferList is a copy of the internal Node.js implementation used for the built-in streams.
As mentioned before, the Writable handles the back-pressure, so the generalized method for both our writables looks like the following:
/**
 * Method to write to the provided state if it can
 *
 * (Will unshift the bytes that cannot be written back to the source)
 *
 * @param src the readable source that writes the chunk
 * @param chunk the chunk to be written
 * @param encoding the chunk encoding, currently not used
 * @param cb the streamCallback provided by the writing state
 * @param state the state which should be written to
 */
private writeState(src: Readable, chunk: Buffer, encoding: string, cb: StreamCallback, state: MergingState): void {
  this.mergeNextTick();
  const bytesAvailable = state.highWaterMark - state.size;
  if (chunk.length <= bytesAvailable) {
    // safe to write to our local buffer
    state.queue.push(chunk);
    state.size += chunk.length;
    if (chunk.length === bytesAvailable) {
      // our queue is full, so store our callback
      this.stateCallbackAndSet(state, cb);
    } else {
      // we still have some space, so we can call the callback immediately
      cb();
    }
    return;
  }
  if (bytesAvailable === 0) {
    // no space available, unshift entire chunk
    src.unshift(chunk);
  } else {
    state.size += bytesAvailable;
    const leftOver = Buffer.alloc(chunk.length - bytesAvailable);
    chunk.copy(leftOver, 0, bytesAvailable);
    // push the amount of bytes available
    state.queue.push(chunk.slice(0, bytesAvailable));
    // unshift what we cannot fit in our queue
    src.unshift(leftOver);
  }
  this.stateCallbackAndSet(state, cb);
}
First we check how much space is available to buffer. If there is enough space for our full chunk, we'll buffer it. If there is no space available, we will unshift the buffer to its readable source. If there is some space available, we'll only unshift what we cannot fit. If our buffer is full, we will store the callback that requests a new chunk. If there is space we will request our next chunk.
this.mergeNextTick() is called because our state has changed and it should be read in the next tick:
private mergeNextTick(): void {
  if (!this.mergeSync) {
    // make sure it is only called once per tick
    // we don't want to call it multiple times
    // since there will be nothing left to read the second time
    this.mergeSync = true;
    process.nextTick(() => this._read(this.readableHighWaterMark));
  }
}
this.stateCallbackAndSet is a helper function that calls the last stored callback (to make sure we don't get into a state that makes our stream stop flowing) and then stores the new one provided.
/**
 * Helper function to call the callback if it exists and set the new callback
 * @param state the state which holds the callback
 * @param cb the new callback to be set
 */
private stateCallbackAndSet(state: MergingState, cb: StreamCallback): void {
  if (!state) {
    return;
  }
  if (state.callback) {
    const callback = state.callback;
    // do callback next tick, such that we can't get stuck in a writing loop
    process.nextTick(() => callback());
  }
  state.callback = cb;
}
Reading
Now onto the reading side; this is the part where we handle selecting the correct stream.
First, our function to read a state, which is pretty straightforward: it reads as many bytes as it is able to. It returns the number of bytes read, which is useful information for our other function.
/**
 * Method to read the provided state if it can
 *
 * @param size the number of bytes to consume
 * @param state the state from which needs to be read
 * @returns the amount of bytes read
 */
private readState(size: number, state: MergingState): number {
  if (state.size === 0) {
    // our queue is empty so we read 0 bytes
    return 0;
  }
  let buffer = null;
  if (state.size < size) {
    buffer = state.queue.consume(state.size, false);
  } else {
    buffer = state.queue.consume(size, false);
  }
  this.push(buffer);
  this.stateCallbackAndSet(state, null);
  state.size -= buffer.length;
  return buffer.length;
}
The doRead method is where all the merging takes place: it fetches the nextMergingIndex. If the merging index is END, we can just read the writingState until the end of the stream. If we are at the merging index, we read from the mergingState. Otherwise we read from the writingState until we reach the next merging index.
/**
 * Method to read from the correct queue
 *
 * The doRead method is called multiple times by the _read method until
 * it is satisfied with the returned size, or until no more bytes can be read
 *
 * @param n the number of bytes that can be read until highWaterMark is hit
 * @throws Errors when something goes wrong, so wrap this method in a try catch.
 * @returns the number of bytes read from either buffer
 */
private doRead(n: number): number {
  // first check all constants below 0,
  // which is only Merge.END right now
  const nextMergingIndex = this.getNextMergingIndex();
  if (nextMergingIndex === Merge.END) {
    // read writing state until the end
    return this.readWritingState(n);
  }
  const bytesToNextIndex = nextMergingIndex - this.index;
  if (bytesToNextIndex === 0) {
    // We are at the merging index, thus should read merging queue
    return this.readState(n, this.mergingState);
  }
  if (n <= bytesToNextIndex) {
    // We are safe to read n bytes
    return this.readWritingState(n);
  }
  // read the bytes until the next merging index
  return this.readWritingState(bytesToNextIndex);
}
readWritingState reads the state and updates the index:
/**
 * Method to read from the writing state
 *
 * @param n maximum number of bytes to be read
 * @returns number of bytes written.
 */
private readWritingState(n: number): number {
  const bytesWritten = this.readState(n, this.writingState);
  this.index += bytesWritten;
  return bytesWritten;
}
Merging
For selecting our streams to merge we'll use a generator function. The generator function yields an index and a stream to merge at that index:
export interface MergingStream { index: number; stream: Readable; }
In doRead getNextMergingIndex() is called. This function returns the index of the next MergingStream. If there is no next mergingStream the generator is called to fetch a new mergingStream. If there is no new merging stream, we'll just return END.
/**
 * Method to get the next merging index.
 *
 * Also fetches the next merging stream if merging stream is null
 *
 * @returns the next merging index, or Merge.END if there is no new mergingStream
 * @throws Error when invalid MergingStream is returned by streamGenerator
 */
private getNextMergingIndex(): number {
  if (!this.mergingStream) {
    this.setNewMergeStream(this.streamGenerator.next().value);
    if (!this.mergingStream) {
      return Merge.END;
    }
  }
  return this.mergingStream.index;
}
In setNewMergeStream we create a new Writable which we can pipe our new merging stream into. For our Writable, we will need to handle the write callback for writing to our state and the final callback to handle the last chunk. We should also not forget to reset our state.
/**
 * Method to set the new merging stream
 *
 * @throws Error when mergingStream has an index less than the current index
 */
private setNewMergeStream(mergingStream?: MergingStream): void {
  if (this.mergingStream) {
    throw new Error('There already is a merging stream');
  }
  // Set a new merging stream
  this.mergingStream = mergingStream;
  if (mergingStream == null || mergingStream.index === Merge.END) {
    // set new state
    this.mergingState = newMergingState(this.writableHighWaterMark);
    // We're done, for now...
    // mergingStream will be handled further once nextMainStream() is called
    return;
  }
  if (mergingStream.index < this.index) {
    throw new Error('Cannot merge at ' + mergingStream.index + ' because current index is ' + this.index);
  }
  // Create a new writable our new mergingStream can write to
  this.mergeWriteStream = new Writable({
    // Create a write callback for our new mergingStream
    write: (chunk, encoding, cb) => this.writeMerge(mergingStream.stream, chunk, encoding, cb),
    final: (cb: StreamCallback) => {
      this.onMergeEnd(mergingStream.stream, cb);
    },
  });
  // Create a new mergingState for our new merging stream
  this.mergingState = newMergingState(this.mergeWriteStream.writableHighWaterMark);
  // Pipe our new merging stream to our sink
  mergingStream.stream.pipe(this.mergeWriteStream);
}
Finalizing
The last step in the process is to handle our final chunks, so that we know when to end merging and can send an end chunk. In our main read loop we first read until our doRead() method returns 0 twice in a row or has filled our read buffer. Once that happens we end our read loop and check our states to see if they have finished.
public _read(size: number): void {
  if (this.finished) {
    // we've finished, there is nothing left to read
    return;
  }
  this.mergeSync = false;
  let bytesRead = 0;
  do {
    const availableSpace = this.readableHighWaterMark - this.readableLength;
    bytesRead = 0;
    READ_LOOP: while (bytesRead < availableSpace && !this.finished) {
      try {
        const result = this.doRead(availableSpace - bytesRead);
        if (result === 0) {
          // either there is nothing in our buffers
          // or our states are outdated (since they get updated in doRead)
          break READ_LOOP;
        }
        bytesRead += result;
      } catch (error) {
        this.emit('error', error);
        this.push(null);
        this.finished = true;
      }
    }
  } while (bytesRead > 0 && !this.finished);
  this.handleFinished();
}
Then in our handleFinished() we check our states.
private handleFinished(): void {
  if (this.finished) {
    // merge stream has finished, so nothing to check
    return;
  }
  if (this.isStateFinished(this.mergingState)) {
    this.stateCallbackAndSet(this.mergingState, null);
    // set our mergingStream to null, to indicate we need a new one
    // which will be fetched by getNextMergingIndex()
    this.mergingStream = null;
    this.mergeNextTick();
  }
  if (this.isStateFinished(this.writingState)) {
    this.stateCallbackAndSet(this.writingState, null);
    this.handleMainFinish(); // checks if there are still mergingStreams left, and sets finished flag
    this.mergeNextTick();
  }
}
isStateFinished() checks if our state has the finalizing flag set and if the queue size equals 0:
/**
 * Method to check if a specific state has completed
 * @param state the state to check
 * @returns true if the state has completed
 */
private isStateFinished(state: MergingState): boolean {
  if (!state || !state.finalizing || state.size > 0) {
    return false;
  }
  return true;
}
The finalizing flag is set from the final callback of our merging Writable stream. For our main stream we have to approach it a little differently, since we have little control over when our stream ends, because the readable calls end on our writable by default. We want to remove this behavior so that we can decide when to finish our stream. This might cause some issues when other end listeners are set, but for most use cases this should be fine.
private onPipe(readable: Readable): void {
  // prevent our stream from being closed prematurely and unpipe it instead
  readable.removeAllListeners('end'); // Note: will cause issues if another end listener is set
  readable.once('end', () => {
    this.finalizeState(this.writingState);
    readable.unpipe();
  });
}
The finalizeState() sets the flag and the callback to end the stream.
/**
 * Method to put a state in finalizing mode
 *
 * Finalizing mode: the last chunk has been received; when size is 0
 * the stream should be removed.
 *
 * @param state the state which should be put in finalizing mode
 */
private finalizeState(state: MergingState, cb?: StreamCallback): void {
  state.finalizing = true;
  this.stateCallbackAndSet(state, cb);
  this.mergeNextTick();
}
And that is how you merge multiple streams into one single sink.
TL;DR: The complete code
This code has been fully tested with my Jest test suite on multiple edge cases, and has a few more features than explained here, such as appending streams and merging into that appended stream by providing Merge.END as the index.
Test result
You can see the tests I have run below; if I forgot any, send me a message and I may write another test for it.
MergeStream
✓ should throw an error when nextStream is not implemented (9ms)
✓ should throw an error when nextStream returns a stream with lower index (4ms)
✓ should reset index after new main stream (5ms)
✓ should write a single stream normally (50ms)
✓ should be able to merge a stream (2ms)
✓ should be able to append a stream on the end (1ms)
✓ should be able to merge large streams into a smaller stream (396ms)
✓ should be able to merge at the correct index (2ms)
Usage
const mergingStream = new Merge({
  *nextStream(): IterableIterator<MergingStream> {
    for (let i = 0; i < 10; i++) {
      const stream = new Readable();
      stream.push(i.toString());
      stream.push(null);
      yield {index: i * 2, stream};
    }
  },
});

const template = new Readable();
template.push(', , , , , , , , , ');
template.push(null);
template.pipe(mergingStream).pipe(getSink());
The result of our sink would be
0, 1, 2, 3, 4, 5, 6, 7, 8, 9
Final Thoughts
This is not the most time-efficient way of doing it, since we only manage one merging buffer at a time, so there is a lot of waiting. For my use case that is fine: I care about it not eating up my memory, and this solution works for me. But there is definitely some space for optimization. The complete code has some extra features that are not fully explained here, such as appending streams and merging into that appended stream. They have been explained with comments, though.

Nodejs: How can I optimize writing many files?

I'm working in a Node environment on Windows. My code is receiving 30 Buffer objects (~500-900kb each) each second, and I need to save this data to the file system as quickly as possible, without engaging in any work that blocks the receipt of the following Buffer (i.e. the goal is to save the data from every buffer, for ~30-45 minutes). For what it's worth, the data is sequential depth frames from a Kinect sensor.
My question is: What is the most performant way to write files in Node?
Here's pseudocode:
let num = 0

async function writeFile(filename, data) {
  fs.writeFileSync(filename, data)
}

// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data){
  let filename = 'file-' + num++
  // Do anything with data here to optimize write?
  writeFile(filename, data)
})
fs.writeFileSync seems much faster than fs.writeFile, which is why I'm using that above. But are there any other ways to operate on the data or write to file that could speed up each save?
First off, you never want to use fs.writeFileSync() when handling real-time requests, because that blocks the entire node.js event loop until the file write is done.
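Just to make the contrast concrete, here is a minimal sketch (the file names are made up):
const fs = require('fs');

// Blocking: the event loop stalls until the write completes
fs.writeFileSync('frame-sync.bin', Buffer.alloc(1024));

// Non-blocking: the write is handed off to libuv's thread pool and the
// callback fires when it's done
fs.writeFile('frame-async.bin', Buffer.alloc(1024), (err) => {
  if (err) console.error(err);
});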
OK, given that you're writing each block of data to a different file, you want to allow multiple disk writes to be in process at the same time, but not unlimited disk writes. So it's still appropriate to use a queue, but this time the queue doesn't just have one write in process at a time, it has some number of writes in process at the same time:
const fs = require('fs');
const EventEmitter = require('events');

class Queue extends EventEmitter {
  constructor(basePath, baseIndex, concurrent = 5) {
    super();
    this.basePath = basePath;
    this.q = [];
    this.paused = false;
    this.inFlightCntr = 0;
    this.fileCntr = baseIndex;
    this.maxConcurrent = concurrent;
  }

  // add item to the queue and write (if not already writing)
  add(data) {
    this.q.push(data);
    this.write();
  }

  // start as many writes from the queue as the concurrency limit allows
  write() {
    while (!this.paused && this.q.length && this.inFlightCntr < this.maxConcurrent) {
      this.inFlightCntr++;
      let buf = this.q.shift();
      try {
        fs.writeFile(this.basePath + this.fileCntr++, buf, err => {
          this.inFlightCntr--;
          if (err) {
            this.err(err);
          } else {
            // write more data
            this.write();
          }
        });
      } catch(e) {
        this.err(e);
      }
    }
  }

  err(e) {
    this.pause();
    this.emit('error', e);
  }

  pause() {
    this.paused = true;
  }

  resume() {
    this.paused = false;
    this.write();
  }
}

let q = new Queue("file-", 0, 5);

// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data) {
  q.add(data);
});

q.on('error', function(e) {
  // got some sort of write error here
  console.log(e);
});
Things to consider:
Experiment with the concurrent value you pass to the Queue constructor. Start with a value of 5. Then see if raising that value any higher gives you better or worse performance. The node.js file I/O subsystem uses a thread pool to implement asynchronous disk writes, so there is a max number of concurrent writes it will allow; cranking the concurrent number up really high probably does not make things go faster.
You can experiment with increasing the size of the disk I/O thread pool by setting the UV_THREADPOOL_SIZE environment variable before you start your node.js app.
Your biggest friend here is disk write speed. So, make sure you have a fast disk with a good disk controller. A fast SSD on a fast bus would be best.
If you can spread the writes out across multiple actual physical disks, that will likely also increase write throughput (more disk heads at work).
This is a prior answer based on the initial interpretation of the question (before editing that changed it).
Since it appears you need to do your disk writes in order (all to the same file), then I'd suggest that you either use a write stream and let the stream object serialize and cache the data for you, or you can create a queue yourself like this:
const fs = require('fs');
const EventEmitter = require('events');

class Queue extends EventEmitter {
  // takes an already opened file handle
  constructor(fileHandle) {
    super();
    this.f = fileHandle;
    this.q = [];
    this.nowWriting = false;
    this.paused = false;
  }

  // add item to the queue and write (if not already writing)
  add(data) {
    this.q.push(data);
    this.write();
  }

  // write next block from the queue (if not already writing)
  write() {
    if (!this.nowWriting && !this.paused && this.q.length) {
      this.nowWriting = true;
      let buf = this.q.shift();
      fs.write(this.f, buf, (err, bytesWritten) => {
        this.nowWriting = false;
        if (err) {
          this.pause();
          this.emit('error', err);
        } else {
          // write next block
          this.write();
        }
      });
    }
  }

  pause() {
    this.paused = true;
  }

  resume() {
    this.paused = false;
    this.write();
  }
}

// pass an already opened file handle
let q = new Queue(fileHandle);

// This fires 30 times/sec and runs for 30-45 min
dataSender.on('gotData', function(data) {
  q.add(data);
});

q.on('error', function(err) {
  // got disk write error here
});
You could use a writeStream instead of this custom Queue class, but the problem with that is that the writeStream may fill up and then you'd have to have a separate buffer as a place to put the data anyway. Using your own custom queue like above takes care of both issues at once.
Other Scalability/Performance Comments
Because you appear to be writing the data serially to the same file, your disk writing won't benefit from clustering or running multiple operations in parallel because they basically have to be serialized.
If your node.js server has other things to do besides just doing these writes, there might be a slight advantage (would have to be verified with testing) to creating a second node.js process and doing all the disk writing in that other process. Your main node.js process would receive the data and then pass it to the child process that would maintain the queue and do the writing.
Another thing you could experiment with is coalescing writes. When you have more than one item in the queue, you could combine them together into a single write. If the writes are already sizable, this probably doesn't make much difference, but if the writes were small this could make a big difference (combining lots of small disk writes into one larger write is usually more efficient).
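As a rough sketch of what that could look like in the write() method of the Queue class just above (illustrative only; whether it pays off depends on how large your buffers already are):
// Replace Queue.write() with a version that drains everything currently
// queued into one Buffer and issues a single fs.write() for it.
write() {
  if (!this.nowWriting && !this.paused && this.q.length) {
    this.nowWriting = true;
    // one larger write instead of several small ones
    let buf = this.q.length === 1 ? this.q.shift() : Buffer.concat(this.q.splice(0));
    fs.write(this.f, buf, (err, bytesWritten) => {
      this.nowWriting = false;
      if (err) {
        this.pause();
        this.emit('error', err);
      } else {
        this.write(); // pick up anything that arrived while we were writing
      }
    });
  }
}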
Your biggest friend here is disk write speed. So, make sure you have a fast disk with a good disk controller. A fast SSD would be best.
I have written a service that does this extensively and the best thing you can do is to pipe the input data directly to the file (if you have an input stream as well).
A simple example where you download a file in such a way:
const http = require('http')
const fs = require('fs')

const ostream = fs.createWriteStream('./output')

http.get('http://nodejs.org/dist/index.json', (res) => {
  res.pipe(ostream)
})
.on('error', (e) => {
  console.error(`Got error: ${e.message}`);
})
So in this example there is no intermediate copying of the whole file involved. As the file is read in chunks from the remote HTTP server, it is written to the file on disk. This is much more efficient than downloading the whole file from the server, saving it in memory, and then writing it to a file on disk.
Streams are a basis of many operations in Node.js so you should study those as well.
One other thing that you should investigate, depending on your scenario, is UV_THREADPOOL_SIZE, as file I/O operations use the libuv thread pool, which is set to 4 by default, and you might fill that up if you do a lot of writing.

How to get a unified onFinish from separate streams (some created from within original stream)

I have a stream process like this:
Incoming file via HTTP (original stream)
-> Check if zipfile
- Yes -> push through an unzip2-stream
- No -> push to S3
When the unzip2-stream finds zip-entries, these are pushed through the same chain of streams, i.e.
Incoming file entry from zip file ("child" stream)
-> Check if zipfile
- Yes -> push through an unzip2-stream
- No -> push to S3
Thanks to https://stackoverflow.com/users/3580261/eljefedelrodeodeljefe I managed to solve the main problem after this conversation:
How to redirect a stream to other stream depending on data in first chunk?
The problem with creating new "child" streams for every zip entry is that these will have no connection to the original stream, so I cannot get a unified onFinish for all the streams.
I don't want to send a 202 off to the sender before I have processed (unzipped and sent to S3) every file. How can I accomplish this?
I'm thinking that I might need some kind of control object which awaits onFinish for all child streams and forces the process to dwell in the original onFinish event until all files are processed. Would this be overkill? Is there a simpler solution?
I ended up making a separate counter for the streams. There is probably a better solution, but this works.
I send the counter object as an argument to the first call to my saveFile() function. The counter is passed along to the unzip stream so it can be passed to saveFile for every file entry.
Just before a stream is started (i.e. piped) I call streamCounter.streamStarted().
In the last onFinish in the pipe chain I call streamCounter.streamFinished()
In the event of a stream going bad I call streamCounter.streamFailed()
Just before I send the 202 in the form post route I wait for streamCounter.streamPromise to resolve.
I'm not very proud of the setInterval solution. It'd probably be better with some kind of event emitting.
module.exports.streamCounter = function() {
  let streamCount = 0;
  let isStarted = false;
  let errors = [];

  this.streamStarted = function(options) {
    isStarted = true;
    streamCount += 1;
    log.debug(`Stream started for ${options.filename}. New streamCount: ${streamCount}`);
  };

  this.streamFinished = function(options) {
    streamCount -= 1;
    log.debug(`Finished stream for ${options.filename}. New streamCount: ${streamCount}`);
  };

  this.streamFailed = function(err) {
    streamCount -= 1;
    errors.push(err);
    log.debug(`Failed stream because (${err.message}). New streamCount: ${streamCount}`);
  };

  this.streamPromise = new Promise(function(resolve, reject) {
    let interval = setInterval(function() {
      if (isStarted && streamCount === 0) {
        clearInterval(interval);
        if (errors.length === 0) {
          log.debug('StreamCounter back on 0. Resolving streamPromise');
          resolve();
        } else {
          log.debug('StreamCounter back on 0. Errors encountered.. Rejecting streamPromise');
          reject(errors[errors.length - 1]);
        }
      }
    }, 100);
  });
};
At first I tried this concept with a promise array and waited for Promise.all() before sending status 202. But Promise.all() only works with static arrays as far as I can tell. My "streamCount" is changing during the streaming so I needed a more dynamic "Promise.all".
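If I ever redo this with event emitting instead of polling, a sketch could look something like this (untested; same counting idea, and the same caveat that the count must not touch zero until everything is done):
const EventEmitter = require('events');

module.exports.streamCounter = function() {
  const emitter = new EventEmitter();
  let streamCount = 0;
  let isStarted = false;
  let errors = [];

  function check() {
    // fires as soon as the last stream reports in; no polling needed
    if (isStarted && streamCount === 0) {
      emitter.emit('done', errors);
    }
  }

  this.streamStarted = function() {
    isStarted = true;
    streamCount += 1;
  };
  this.streamFinished = function() {
    streamCount -= 1;
    check();
  };
  this.streamFailed = function(err) {
    streamCount -= 1;
    errors.push(err);
    check();
  };
  this.streamPromise = new Promise(function(resolve, reject) {
    emitter.once('done', function(errs) {
      if (errs.length === 0) {
        resolve();
      } else {
        reject(errs[errs.length - 1]);
      }
    });
  });
};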

Is http.ServerResponse.write() blocking?

Is it possible to make response.write non-blocking? I've written a simple test to see if other clients can connect while one downloads a file:
var connect = require('connect');

var longString = 'a';
for (var i = 0; i < 29; i++) { // 512 MiB
  longString += longString;
}
console.log(longString.length)

function download(request, response) {
  response.setHeader("Content-Length", longString.length);
  response.setHeader("Content-Type", "application/force-download");
  response.setHeader("Content-Disposition", 'attachment; filename="file"');
  response.write(longString);
  response.end();
}

var app = connect().use(download);
connect.createServer(app).listen(80);
And it seems like write is blocking!
Am I doing something wrong?
Update: So, it doesn't block and it blocks at the same time. It doesn't block in the sense that two files can be downloaded simultaneously. And it blocks in the sense that creating a buffer is a long operation.
Any processing done strictly in JavaScript will block. response.write(), at least as of v0.8, is no exception to this:
The first time response.write() is called, it will send the buffered header information and the first body to the client. The second time response.write() is called, Node assumes you're going to be streaming data, and sends that separately. That is, the response is buffered up to the first chunk of body.
Returns true if the entire data was flushed successfully to the kernel buffer. Returns false if all or part of the data was queued in user memory. 'drain' will be emitted when the buffer is again free.
What may save some time is to convert longString to Buffer before attempting to write() it, since the conversion will occur anyways:
var longString = 'a';
for (...) { ... }
longString = new Buffer(longString);
But, it would probably be better to stream the various chunks of longString rather than all-at-once (Note: Streams are changing in v0.10):
var longString = 'a',
    chunkCount = Math.pow(2, 29),
    bufferSize = Buffer.byteLength(longString),
    longBuffer = new Buffer(longString);

function download(request, response) {
  var current = 0;
  response.setHeader("Content-Length", bufferSize * chunkCount);
  response.setHeader("Content-Type", "application/force-download");
  response.setHeader("Content-Disposition", 'attachment; filename="file"');
  function writeChunk() {
    if (current < chunkCount) {
      current++;
      if (response.write(longBuffer)) {
        process.nextTick(writeChunk);
      } else {
        response.once('drain', writeChunk);
      }
    } else {
      response.end();
    }
  }
  writeChunk();
}
And, if the eventual goal is to stream a file from disk, this can be even easier with fs.createReadStream() and stream.pipe():
function download(request, response) {
  // response.setHeader(...)
  // ...
  fs.createReadStream('./file-on-disk').pipe(response);
}
Nope, it does not block. I tried one download from IE and another from Firefox; I started the IE one first, but could still finish the Firefox download first.
I tried it for 1 MB (i < 20) and it works the same, just faster.
You should know that whatever longString you create requires memory allocation. Try it for i < 30 (on Windows 7) and it will throw FATAL ERROR: JS Allocation failed - process out of memory.
The time goes into memory allocation/copying, nothing else. Since it is a huge file, the response takes time and your download merely looks like it is blocking. Try it yourself with smaller values (i < 20 or so).
