Memory considerations when using Buffer and buffer.toString() in Node.js streams

I am performing readStream.pipe(transformStream).pipe(writeStream). The application reads XML from the file system using a read stream --> does some manipulation based on different tags in a transform stream --> writes it to another file system.
During the transform operation inside the transform stream, I am doing buffer.toString() for every chunk, as I need to manipulate the string before pushing it:
myTransformStream._transform = function(chunk, encoding, done) {
  var stringTobeManipulated = chunk.toString();
  // perform changes on stringTobeManipulated, then push it to the write stream
  this.push(stringTobeManipulated);
  done();
};
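A small caveat with this pattern: chunk.toString() can mangle a multi-byte UTF-8 character that happens to be split across two chunks. Below is a minimal sketch using the built-in string_decoder module to guard against that (the push/done calls stand in for the omitted manipulation step and are assumptions, not part of the original question):

var StringDecoder = require('string_decoder').StringDecoder;
var decoder = new StringDecoder('utf8');

myTransformStream._transform = function(chunk, encoding, done) {
  // decoder.write() holds back any trailing partial character until the next chunk arrives
  var stringTobeManipulated = decoder.write(chunk);
  // perform changes on stringTobeManipulated, then:
  this.push(stringTobeManipulated);
  done();
};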
The size of the XMLs can be up to 3 MB, and I noticed it got divided into ~44 chunks.
Based on this, I have questions concerning memory consumption, both in the V8 heap and in system memory:
1) I understand that buffers are stored outside the V8 heap. When I do a chunk.toString() in my _transform function, does it create a JavaScript string object inside the V8 heap? If so, I am assuming it will be garbage collected after it loses all its references.
2) As the buffers are part of system memory, I believe they are not garbage collected, so when is that memory freed up?
3) Is this application a good use case for a transform stream, given that I am converting every chunk to a string?
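Purely as an illustration (not from the question), one way to watch what happens per chunk is to log process.memoryUsage(): heapUsed reflects the V8 heap, where the strings created by toString() live, while rss also covers Buffer memory allocated outside the heap:

// Illustrative only: log heap vs. resident memory for each chunk
myTransformStream._transform = function(chunk, encoding, done) {
  var stringTobeManipulated = chunk.toString();
  var mem = process.memoryUsage();
  console.log('heapUsed:', mem.heapUsed, 'rss:', mem.rss);
  this.push(stringTobeManipulated);
  done();
};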
EDITED:
Maybe I am not explaining myself clearly. Anyway, I have been trying to find a way of removing the namespace from XML tags before converting the document to JSON. Here is the code I ended up with; it takes advantage of memoization in JavaScript (the cross-chunk state is stored on the _transform function itself). Please let me know if you find a better or more memory-efficient way. We are using the actionhero.js framework.
var action = {};
var xml2js = require('xml2js');
var fs = require('fs');
/////////////////////////////////////////////////////////////////////
// metadata
action.name = 'removeNamespace';
action.description = 'I will parse the article related xml and yield the result';
action.inputs = {
  'required': [],
  'optional': []
};
action.blockedConnectionTypes = [];
action.outputExample = {
  articleHeading: 'heading',
  articleBody: 'body'
};
/////////////////////////////////////////////////////////////////////
// functional
action.run = function(api, connection, next) {
  var stream = require('stream');
  var removeNamespace = new stream.Transform();
  var util = require('util');
  var output = fs.createWriteStream('output.xml');

  removeNamespace._transform = function(chunk, encoding, done) {
    removeNamespace._transform.brokenTag = removeNamespace._transform.brokenTag || false;
    removeNamespace._transform.brokenString = removeNamespace._transform.brokenString || '';

    var convertedString = chunk.toString().trim();
    if (removeNamespace._transform.brokenTag === true) {
      convertedString = removeNamespace._transform.brokenString + convertedString;
    }
    removeNamespace._transform.brokenTag = false;
    removeNamespace._transform.brokenString = '';

    // if the chunk ends in the middle of a tag, hold the partial tag back for the next chunk
    if (convertedString.lastIndexOf('<') > convertedString.lastIndexOf('>')) {
      removeNamespace._transform.brokenString = convertedString.substring(convertedString.lastIndexOf('<'));
      convertedString = convertedString.substring(0, convertedString.lastIndexOf('<'));
      removeNamespace._transform.brokenTag = true;
    }

    convertedString = convertedString.replace(/>\s+</g, '><')
      .replace(/<[A-Za-z]+:/gi, '<')
      .replace(/<\/[A-Za-z]+:/gi, '</');
    done(null, convertedString);
  };
  var Writable = stream.Writable;

  function xmlInMemory(keyToXml, options) {
    Writable.call(this, options);
    this[keyToXml] = new Buffer('');

    this._write = function(chunk, encoding, callback) {
      chunk = (Buffer.isBuffer(chunk)) ? chunk : new Buffer(chunk, encoding);
      this[keyToXml] = Buffer.concat([this[keyToXml], chunk]);
      callback();
    };
  }
  util.inherits(xmlInMemory, Writable);
  var source = fs.createReadStream('path/to/your/xmlfile');
  var target = new xmlInMemory('keyToXml');
  source.pipe(removeNamespace).pipe(target);

  target.on('finish', function() {
    var parser = new xml2js.Parser();
    parser.parseString(target.keyToXml.toString(), function(err, result) {
      connection.response.xml2js = result;
      next(connection, true);
    });
  });
};
/////////////////////////////////////////////////////////////////////
// exports
exports.action = action;
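For comparison, here is a sketch (untested, same logic) that keeps the broken-tag bookkeeping on the transform instance via `this` instead of on the `_transform` function object; functionally it should behave the same way:

var stream = require('stream');

// Sketch: namespace-stripping transform with cross-chunk state kept on the instance
var removeNamespace = new stream.Transform();
removeNamespace.brokenString = '';

removeNamespace._transform = function(chunk, encoding, done) {
  // prepend whatever partial tag was held back from the previous chunk
  var convertedString = this.brokenString + chunk.toString().trim();
  this.brokenString = '';

  // if the chunk ends mid-tag, hold the partial tag back for the next chunk
  if (convertedString.lastIndexOf('<') > convertedString.lastIndexOf('>')) {
    this.brokenString = convertedString.substring(convertedString.lastIndexOf('<'));
    convertedString = convertedString.substring(0, convertedString.lastIndexOf('<'));
  }

  convertedString = convertedString.replace(/>\s+</g, '><')
    .replace(/<[A-Za-z]+:/gi, '<')
    .replace(/<\/[A-Za-z]+:/gi, '</');
  done(null, convertedString);
};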

Related

wait for previous stream to be empty before allowing reading

Say I have a file that contains a list of integers, one per line. I use fs.createReadStream and pipe that into split (so that each chunk is an integer). Then I pipe that into a duplex stream that is supposed to add the numbers and write the sum by piping into fs.createWriteStream.
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Duplex();
addIntegers.sum = 0;

addIntegers._read = function(size) {
  this.push(this.sum + '\n');
};

addIntegers._write = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
};

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));
When I run this, sum.txt just gets continually filled with zeroes and the program never terminates (as expected). How do I wait for the input stream (split) to be empty before allowing the output stream (fs.createWriteStream) to read from addIntegers?
I figured it out.
I decided to use a Transform stream instead (thanks mscdex) because it has a method (_flush) that gets called after all written data is consumed. The working code is below. Don't forget to npm i split :)
var fs = require('fs');
var stream = require('stream');
var split = require('split');

var addIntegers = new stream.Transform();
addIntegers.sum = 0;

addIntegers._transform = function(chunk, encoding, done) {
  this.sum += +chunk;
  done();
};

addIntegers._flush = function(done) {
  this.push(this.sum + '\n');
  done();
};

fs.createReadStream('list-of-integers.txt')
  .pipe(split())
  .pipe(addIntegers)
  .pipe(fs.createWriteStream('sum.txt'));

Reasons why a node stream read() function might return null?

I am building a node application that reads a CSV file from the file system, analyzes the file, and then parses the file using the csv-parse module. Each of these steps are in the form of a stream, piped one into the next.
The trouble I am having is that for some files, the parse step can read the stream, but for others, the read() method returns null on the readable event and I don't know why.
In the code below, specifically, I will sometimes see data come through on calling read() on the parser stream, and other times it will return null. CSV files that succeed always succeed, and CSV files that fail always fail. I tried to determine some difference between the files, but other than using different field names in the first row, and slightly different data in the body, I can't see any significant difference between the source files.
What are some reasons that a node stream's read() function might return null after a readable event?
Sample code:
var parse = require('csv-parse');
var fs = require('fs');
var stream = require('stream');
var byteCounter = new stream.Transform({objectMode: true});
byteCounter.setEncoding('utf8');
var totalBytes = 0;

// count the bytes we have read
byteCounter._transform = function (chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) {
    data = this._lastLineData + data;
  }
  var lines = data.split('\n');
  // this is because each chunk will probably not end precisely at the end of a line:
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];
  lines.forEach(function(line) {
    totalBytes += line.length + 1; // we add an extra byte for the end-of-line
    this.push(line);
  }, this);
  done();
};

byteCounter._flush = function (done) {
  if (this._lastLineData) {
    this.push(this._lastLineData);
  }
  this._lastLineData = null;
  done();
};

// csv parser
var parser = parse({
  delimiter: ",",
  comment: "#",
  skip_empty_lines: true,
  auto_parse: true,
  columns: true
});

parser.on('readable', function() {
  var row;
  while (null !== (row = parser.read())) {
    // do stuff
  }
});

// start by reading a file, piping to byteCounter, then pipe to parser.
var myPath = "/path/to/file.csv";
var options = {
  encoding: "utf-8"
};
fs.createReadStream(myPath, options).pipe(byteCounter).pipe(parser);
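Two things are worth checking against this sample. First, read() returning null inside a 'readable' handler is normal whenever the parser's internal buffer is momentarily empty; more data may still arrive. Second, byteCounter splits on '\n' and pushes the bare lines, so the csv parser downstream never sees a record delimiter between them. Below is a hedged sketch of re-appending the delimiter (an assumption about the cause, not a confirmed fix for the failing files):

// Sketch only: same _transform as above, but re-appending the '\n' that split() removed
byteCounter._transform = function (chunk, encoding, done) {
  var data = this._lastLineData ? this._lastLineData + chunk.toString() : chunk.toString();
  var lines = data.split('\n');
  // keep the trailing partial line for the next chunk
  this._lastLineData = lines.pop();
  lines.forEach(function (line) {
    totalBytes += line.length + 1; // extra byte for the end-of-line
    this.push(line + '\n');        // keep a record delimiter for the csv parser
  }, this);
  done();
};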

Is it possible to register multiple listeners to a child process's stdout data event? [duplicate]

I need to run two commands in series that need to read data from the same stream.
After piping a stream into another, the buffer is emptied, so I can't read data from that stream again; this doesn't work:
var spawn = require('child_process').spawn;
var fs = require('fs');
var request = require('request');

var inputStream = request('http://placehold.it/640x360');
var identify = spawn('identify', ['-']);

inputStream.pipe(identify.stdin);

var chunks = [];
identify.stdout.on('data', function(chunk) {
  chunks.push(chunk);
});
identify.stdout.on('end', function() {
  var size = getSize(Buffer.concat(chunks)); // width
  var convert = spawn('convert', ['-', '-scale', size * 0.5, 'png:-']);
  inputStream.pipe(convert.stdin);
  convert.stdout.pipe(fs.createWriteStream('half.png'));
});

function getSize(buffer) {
  return parseInt(buffer.toString().split(' ')[2].split('x')[0]);
}
Request complains about this:
Error: You cannot pipe after data has been emitted from the response.
Changing the inputStream to fs.createReadStream yields the same issue, of course.
I don't want to write into a file but reuse in some way the stream that request produces (or any other stream, for that matter).
Is there a way to reuse a readable stream once it finishes piping?
What would be the best way to accomplish something like the above example?
You have to create a duplicate of the stream by piping it into two streams. You can create a simple copy with a PassThrough stream; it simply passes the input through to the output.
const spawn = require('child_process').spawn;
const PassThrough = require('stream').PassThrough;
const a = spawn('echo', ['hi user']);
const b = new PassThrough();
const c = new PassThrough();
a.stdout.pipe(b);
a.stdout.pipe(c);
let count = 0;
b.on('data', function (chunk) {
  count += chunk.length;
});
b.on('end', function () {
  console.log(count);
  c.pipe(process.stdout);
});
Output:
8
hi user
The first answer only works if streams take roughly the same amount of time to process data. If one takes significantly longer, the faster one will request new data, consequently overwriting the data still being used by the slower one (I had this problem after trying to solve it using a duplicate stream).
The following pattern worked very well for me. It uses a library based on Stream2 streams, Streamz, and Promises to synchronize async streams via a callback. Using the familiar example from the first answer:
var spawn = require('child_process').spawn;
var pass = require('stream').PassThrough;
var streamz = require('streamz').PassThrough;
var Promise = require('bluebird');

var a = spawn('echo', ['hi user']);
var b = new pass();
var c = new pass();

a.stdout.pipe(streamz(combineStreamOperations));

function combineStreamOperations(data, next) {
  Promise.join(b, c, function(b, c) { // perform n operations on the same data
    next(); // request more
  });
}

var count = 0;
b.on('data', function(chunk) { count += chunk.length; });
b.on('end', function() { console.log(count); c.pipe(process.stdout); });
You can use this small npm package I created:
readable-stream-clone
With this you can reuse readable streams as many times as you need
For the general problem, the following code works fine:
var PassThrough = require('stream').PassThrough;

var a = new PassThrough();
var b1 = new PassThrough();
var b2 = new PassThrough();

a.pipe(b1);
a.pipe(b2);

b1.on('data', function(data) {
  console.log('b1:', data.toString());
});
b2.on('data', function(data) {
  console.log('b2:', data.toString());
});

a.write('text');
I have a different solution for writing to two streams simultaneously. Naturally, the time to write will be the sum of the two times, but I use it to respond to a download request where I want to keep a copy of the downloaded file on my server (actually, I use an S3 backup, so I cache the most-used files locally to avoid multiple file transfers).
/**
 * A utility class made to write to a file while answering a file download request
 */
class TwoOutputStreams {
  constructor(streamOne, streamTwo) {
    this.streamOne = streamOne;
    this.streamTwo = streamTwo;
  }

  setHeader(header, value) {
    if (this.streamOne.setHeader)
      this.streamOne.setHeader(header, value);
    if (this.streamTwo.setHeader)
      this.streamTwo.setHeader(header, value);
  }

  write(chunk) {
    this.streamOne.write(chunk);
    this.streamTwo.write(chunk);
  }

  end() {
    this.streamOne.end();
    this.streamTwo.end();
  }
}
You can then use this as a regular OutputStream
const twoStreamsOut = new TwoOutputStreams(fileOut, responseStream)
and pass it to your method as if it were a response or a fileOutputStream.
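For instance, a sketch of wiring it up in a download handler (fileOut, res and sourceStream are illustrative names, not part of the original answer):

// Illustrative wiring only: cache the download locally while answering the request
var fs = require('fs');

function sendAndCache(sourceStream, res) {
  var fileOut = fs.createWriteStream('/tmp/cached-copy');
  var out = new TwoOutputStreams(fileOut, res);
  out.setHeader('Content-Type', 'application/octet-stream'); // only applied to streams that support setHeader
  sourceStream.on('data', function(chunk) { out.write(chunk); });
  sourceStream.on('end', function() { out.end(); });
}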
If you have async operations on the PassThrough streams, the answers posted here won't work.
A solution that works for async operations includes buffering the stream content and then creating streams from the buffered result.
To buffer the result you can use concat-stream
const Promise = require('bluebird');
const concat = require('concat-stream');

const getBuffer = function(stream) {
  return new Promise(function(resolve, reject) {
    var gotBuffer = function(buffer) {
      resolve(buffer);
    };
    var concatStream = concat(gotBuffer);
    stream.on('error', reject);
    stream.pipe(concatStream);
  });
};
To create streams from the buffer you can use:
const { Readable } = require('stream');

const getBufferStream = function(buffer) {
  const stream = new Readable();
  stream.push(buffer);
  stream.push(null);
  return Promise.resolve(stream);
};
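Putting the two helpers together might look like this (a sketch; inputStream, firstConsumer and secondConsumer are placeholder names for whatever readable and writables you actually have):

// Sketch: buffer the source once, then hand a fresh readable to each consumer
getBuffer(inputStream)
  .then(function(buffer) {
    return Promise.all([getBufferStream(buffer), getBufferStream(buffer)]);
  })
  .then(function(streams) {
    streams[0].pipe(firstConsumer);  // e.g. one child process's stdin
    streams[1].pipe(secondConsumer); // e.g. another child process's stdin
  });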
What about piping into two or more streams not at the same time?
For example:
var PassThrough = require('stream').PassThrough;

var mybinaryStream = stream.start(); // never-ending audio stream
var file1 = fs.createWriteStream('file1.wav', {encoding: 'binary'});
var file2 = fs.createWriteStream('file2.wav', {encoding: 'binary'});
var mypass = new PassThrough();

mybinaryStream.pipe(mypass);
mypass.pipe(file1);

setTimeout(function() {
  mypass.pipe(file2);
}, 2000);
The above code does not produce any errors, but file2 is empty.

Node.js: Transform stream is not pumping data out completely?

I was trying to learn streaming in Node.js by writing a small script, but after executing it, the last stream is not pushing all the data.
var stream = require('stream');
var fs = require('fs');
var util = require('util');
function Newliner() {
  stream.Transform.call(this);
}
util.inherits(Newliner, stream.Transform);

Newliner.prototype._transform = function(chunk, enc, done) {
  var split = 0;
  for (var i = 0; i < chunk.length; i++) {
    if (chunk[i] == 10) {
      this.push(chunk.slice(split, i));
      split = i + 1;
    }
  }
};

function Greper(options) {
  stream.Transform.call(this);
  this.regex = new RegExp(options);
}
util.inherits(Greper, stream.Transform);

Greper.prototype._transform = function(chunk, enc, done) {
  this.push(chunk); // Even this is not working.
  /*
  var a = chunk.toString();
  if (this.regex.test(a)) {
    this.push(chunk);
  }
  */
};
var n = new Newliner();
var g = new Greper("line");
var f = fs.createReadStream('a.txt');
f.pipe(n).pipe(g).pipe(process.stdout);
Input file a.txt is,
This is line one.
Another line.
Third line.
When executing, only one line is displayed. What is the reason for this?
$ node test.js
This is line one.
Note: When I pipe the file read stream directly to 'g', it works correctly.
You need to call the callback of the _transform() function when you are done processing the chunk. From the documentation:
callback (Function) Call this function (optionally with an error argument) when you are done processing the supplied chunk.
No more data will be pushed into the stream until the callback is called. If you don't call it, then the chunk will not be considered as processed… which is why your program stops after processing only one line.
Just add:
done();
at the end of the Newliner.prototype._transform and Greper.prototype._transform functions.
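Applied to the code above, the two fixed _transform implementations would look like this (only done() is added; everything else is unchanged):

// Per the answer: call the callback once the chunk has been processed
Newliner.prototype._transform = function(chunk, enc, done) {
  var split = 0;
  for (var i = 0; i < chunk.length; i++) {
    if (chunk[i] == 10) {            // 10 === '\n'
      this.push(chunk.slice(split, i));
      split = i + 1;
    }
  }
  done(); // signal that this chunk is fully processed
};

Greper.prototype._transform = function(chunk, enc, done) {
  this.push(chunk);
  done();
};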

Creating a Node.js stream from two piped streams

I'd like to combine two Node.js streams into one by piping them, if possible. I'm using Transform streams.
In other words, I'd like my library to return myStream for people to use. For example they could write:
process.stdin.pipe(myStream).pipe(process.stdout);
And internally I'm using a third-party vendorStream that does some work, plugged into my own logic contained in myInternalStream. So what's above would translate to:
process.stdin.pipe(vendorStream).pipe(myInternalStream).pipe(process.stdout);
Can I do something like that? I've tried var myStream = vendorStream.pipe(myInternalStream), but that obviously doesn't work (pipe() returns its destination, so myStream would just be myInternalStream).
To make an analogy with bash, let's say I want to write a program that checks if the letter h is present in the last line of some stream (tail -n 1 | grep h), I can create a shell script:
# myscript.sh
tail -n 1 | grep h
And then if people do:
$ printf "abc\ndef\nghi" | . myscript.sh
It just works.
This is what I have so far:
// Combine a pipe of two streams into one stream

var util = require('util')
  , Transform = require('stream').Transform;

var chunks1 = [];
var stream1 = new Transform();
var soFar = '';

stream1._transform = function(chunk, encoding, done) {
  chunks1.push(chunk.toString());
  var pieces = (soFar + chunk).split('\n');
  soFar = pieces.pop();
  for (var i = 0; i < pieces.length; i++) {
    var piece = pieces[i];
    this.push(piece);
  }
  return done();
};

var chunks2 = [];
var count = 0;
var stream2 = new Transform();

stream2._transform = function(chunk, encoding, done) {
  chunks2.push(chunk.toString());
  count = count + 1;
  this.push(count + ' ' + chunk.toString() + '\n');
  done();
};

var stdin = process.stdin;
var stdout = process.stdout;

process.on('exit', function () {
  console.error('chunks1: ' + JSON.stringify(chunks1));
  console.error('chunks2: ' + JSON.stringify(chunks2));
});
process.stdout.on('error', process.exit);

// stdin.pipe(stream1).pipe(stream2).pipe(stdout);
// $ (printf "abc\nd"; sleep 1; printf "ef\nghi\n") | node streams-combine.js
// Outputs:
// 1 abc
// 2 def
// 3 ghi
// chunks1: ["abc\nd","ef\nghi\n"]
// chunks2: ["abc","def","ghi"]

// Best working solution I could find
var stream3 = function(src) {
  return src.pipe(stream1).pipe(stream2);
};
stream3(stdin).pipe(stdout);

// $ (printf "abc\nd"; sleep 1; printf "ef\nghi\n") | node streams-combine.js
// Outputs:
// 1 abc
// 2 def
// 3 ghi
// chunks1: ["abc\nd","ef\nghi\n"]
// chunks2: ["abc","def","ghi"]
Is this at all possible? Let me know if what I'm trying to do isn't clear.
Thanks!
You can watch for something to be piped to your stream, and then unpipe it and pipe it to the streams you're interested in:
var PassThrough = require('stream').PassThrough;

var stream3 = new PassThrough();

// When a source stream is piped to us, undo that pipe, and save
// off the source stream piped into our internally managed streams.
stream3.on('pipe', function(source) {
  source.unpipe(this);
  this.transformStream = source.pipe(stream1).pipe(stream2);
});

// When we're piped to another stream, instead pipe our internal
// transform stream to that destination.
stream3.pipe = function(destination, options) {
  return this.transformStream.pipe(destination, options);
};

stdin.pipe(stream3).pipe(stdout);
You can extract this functionality into your own constructable stream class:
var util = require('util');
var PassThrough = require('stream').PassThrough;

var StreamCombiner = function() {
  this.streams = Array.prototype.slice.apply(arguments);

  this.on('pipe', function(source) {
    source.unpipe(this);
    for (var i in this.streams) {
      source = source.pipe(this.streams[i]);
    }
    this.transformStream = source;
  });
};
util.inherits(StreamCombiner, PassThrough);

StreamCombiner.prototype.pipe = function(dest, options) {
  return this.transformStream.pipe(dest, options);
};

var stream3 = new StreamCombiner(stream1, stream2);
stdin.pipe(stream3).pipe(stdout);
One option is perhaps to use multipipe which lets you chain multiple transforms together, wrapped as a single transform stream:
// my-stream.js
var multipipe = require('multipipe');

module.exports = function createMyStream() {
  return multipipe(vendorStream, myInternalStream);
};
Then you can do:
var createMyStream = require('./my-stream');
var myStream = createMyStream();
process.stdin.pipe(myStream).pipe(process.stdout);
Clarification: This makes stdin go through vendorStream, then myInternalStream and finally stdout.
