Stream and transform a file in place with Node.js

I'd like to do something like:
var fs = require('fs');
var through = require('through');
var file = 'path/to/file.json';
var input = fs.createReadStream(file, 'utf8');
var output = fs.createWriteStream(file, 'utf8');
var buf = '';
input
  .pipe(through(function data(chunk) { buf += chunk; }, function end() {
    var data = JSON.parse(buf);
    // Do some transformation on the obj, and then...
    this.queue(JSON.stringify(data, null, ' '));
  }))
  .pipe(output);
But this fails because it's trying to read and write to the same destination. There are ways around it, like only piping to output from within the end callback above.
Is there a better way? By better, I mean one that uses less code or less memory. And yes, I'm aware that I could just do:
var fs = require('fs');
var file = 'path/to/file.json';
var str = fs.readFileSync(file, 'utf8');
var data = JSON.parse(str);
// Do some transformation on the obj, and then...
fs.writeFileSync(file, JSON.stringify(data, null, ' '), 'utf8');

There is no way to make either version use less memory, because you need the whole file in memory to parse it into a JavaScript object; in that respect, both versions of your code are equivalent memory-wise. If you can do some of the work without operating on the full JSON object, check out JSONStream.
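For instance, if the transformation only touches individual elements of a large array inside the JSON, a streaming pass is possible. A minimal sketch, assuming the file holds a top-level "items" array (the key name, the per-item change, and the separate output file are illustrative):

var fs = require('fs');
var JSONStream = require('JSONStream');
var through = require('through');

fs.createReadStream('path/to/file.json', 'utf8')
  .pipe(JSONStream.parse('items.*'))      // emits one element of items[] at a time
  .pipe(through(function (item) {
    item.transformed = true;              // per-item transformation
    this.queue(item);
  }))
  .pipe(JSONStream.stringify('{"items":[\n', ',\n', '\n]}\n'))
  .pipe(fs.createWriteStream('path/to/file.out.json'));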
In your example, you should read the file, then parse and transform it, then write the result back to a file; however, you shouldn't use the synchronous versions of these functions. See the end of this paragraph of the Node.js documentation:
In busy processes, the programmer is strongly encouraged to use the asynchronous versions of these calls. The synchronous versions will block the entire process until they complete--halting all connections.
Anyway, I don't think you can read from a file while you're overwriting it. See this particular answer to the same problem.
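For completeness, a minimal sketch of that asynchronous read-parse-write flow (the transformation is a placeholder, as in the question):

var fs = require('fs');
var file = 'path/to/file.json';

fs.readFile(file, 'utf8', function (err, str) {
  if (err) throw err;
  var data = JSON.parse(str);
  // Do some transformation on the obj, and then...
  fs.writeFile(file, JSON.stringify(data, null, ' '), 'utf8', function (err) {
    if (err) throw err;
  });
});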

Related

node.js: determine length of stream before it is piped to final destination

The context behind this question is that I am taking an image buffer, compressing it with pngquant, and then piping the compressed image to the response. Something like:
// https://www.npmjs.com/package/pngquant
const PngQuant = require('pngquant');
// start with base64-encoded png image data:
var base64data = '.......';
// then create buffer from this, as per:
// https://stackoverflow.com/a/28440633/4070848
// https://stackoverflow.com/a/52257416/4070848
var imgBuffer = Buffer.from(base64data, 'base64');
// set up pngquant...
const optionsArr = [ ..... ];
const myPngQuanter = new PngQuant(optionsArr);
// convert buffer into stream, as per:
// https://stackoverflow.com/a/16044400/4070848
const stream = require('stream');
var bufferStream = new stream.PassThrough();
bufferStream.end(imgBuffer);
// pipe the image buffer (stream) through pngquant (to compress it) and then to res...
bufferStream.pipe(myPngQuanter).pipe(res);
I want to determine the compression ratio achieved by the pngquant operation. I can easily find the starting size with:
const sizeBefore = imgBuffer.length;
I also need the size of the compressed stream. Furthermore, this information must be available before the stream is piped to the res destination because I need to add a header to res based on the compression stats.
To get sizeAfter I've tried the length-stream module, where you can insert a listener into the pipe (between myPngQuanter and res) to determine the length as it passes through. Whilst this does seem to work to determine the length of the compressed stream, it doesn't happen in time to add any headers to res. I've also tried stream-length, but cannot get it to work at all.
Any help appreciated.
Well streams by their nature don't really have length information (a stream can be infinite, e.g. opening /dev/random), so the easiest option I can see is using another temporary buffer. It is unfortunate that pngquant doesn't have options for operating on buffers, but there is not much you can do about that, besides using a different package altogether.
Edit: since stream-buffer might not work here, an alternative:
There is a package called stream-to-array, which allows easy implementation of a stream-to-buffer conversion. As per the README, the code should be modified to:
const toArray = require('stream-to-array');
const util = require('util');

toArray(bufferStream.pipe(myPngQuanter))
  .then(function (parts) {
    const buffers = parts
      .map(part => util.isBuffer(part) ? part : Buffer.from(part));
    const compressedBuffer = Buffer.concat(buffers);
    console.log(compressedBuffer.length); // here is the size of the compressed data
    res.write(compressedBuffer);
  });
Or alternatively with await, if you happen to be in an async context:
const toArray = require('stream-to-array');
const util = require('util');
const parts = await toArray(bufferStream.pipe(myPngQuanter));
const buffers = parts.map(part => util.isBuffer(part) ? part : Buffer.from(part));
const compressedBuffer = Buffer.concat(buffers);
console.log(compressedBuffer.length); // here is the size of the compressed data
res.write(compressedBuffer);
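Not part of the original answer, but for the stated goal of adding a header based on the compression stats, something along these lines could replace the final res.write() call; the X-Compression-Ratio header name is illustrative, and sizeBefore is assumed to still be in scope:

const sizeAfter = compressedBuffer.length;
const ratio = sizeAfter / sizeBefore;

// Headers must be set before any part of the body is written.
res.setHeader('Content-Length', sizeAfter);
res.setHeader('X-Compression-Ratio', ratio.toFixed(3)); // hypothetical header name
res.end(compressedBuffer);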

Read chunk from Gridfs and convert to Buffer

I have a question about buffers. Here is my code:
var Q = require('q');
var Grid = require('gridfs-stream');
var mongodb = require('mongodb');

var gfs = Grid(db, mongodb);
var deferred = Q.defer();
var image_buf = new Buffer('buffer');
var readableStream = gfs.createReadStream(name);

readableStream.on('data', function(chunk) {
  console.log(chunk);
  image_buf = Buffer.concat([image_buf, chunk]);
  console.log(image_buf); // differs from the chunk above
});
readableStream.on('end', function() {
  db.close();
  deferred.resolve(image_buf);
});
return deferred.promise;
What I'm doing is reading an image from MongoDB through gridfs-stream. I want to collect all the chunks from the stream into another variable so that I can reuse them to draw the image in another API. Therefore I use image_buf and Buffer to perform the task. However, I get a completely different buffer string. As you can see in the code above, I logged both the chunk and the resulting image_buf, and they are totally different. Can anyone tell me the reason for this, and how I can correctly collect all the chunks? Thanks a lot!
UPDATE: OK, so I figured it out now: I will append my code below for anyone who is struggling with the same problem as mine:
readableStream.on('data', function(chunk) {
  console.log("writing!!!");
  if (!image_buf)
    image_buf = chunk;
  else
    image_buf = Buffer.concat([image_buf, chunk]);
});
The update provided by the question poster does not work, so I am going to provide an answer of my own. Instead of using new Buffer('buffer'), it is better to push the chunks into a simple array and use Buffer.concat(bufferArray) at the end to get the buffer of the whole stream, like this:
var readableStream = gfs.createReadStream(name);
var bufferArray = [];

readableStream.on('data', function(chunk) {
  bufferArray.push(chunk);
});
readableStream.on('end', function() {
  var buffer = Buffer.concat(bufferArray);
  deferred.resolve(buffer);
});

What are the semantics of the return value of nodejs stream.Readable.push()

I have this code:
"use strict";
var fs = require("fs");
var stream = require('readable-stream');
var Transform = require('stream').Transform,
util = require('util');
var TransformStream = function() {
Transform.call(this, {objectMode: true});
};
util.inherits(TransformStream, Transform);
TransformStream.prototype._transform = function(chunk, encoding, callback) {
if(this.push(chunk)) {
console.log("push returned true");
} else {
console.log("push returned false");
}
callback();
};
var inStream = fs.createReadStream("in.json");
var outStream = fs.createWriteStream("out.json");
var transform = new TransformStream();
inStream.pipe(transform).pipe(outStream);
in.json is 88679467 bytes in size. The first 144 writes state that push returned true. The remaining writes (1210 of them) all state that push returned false.
out.json ends up being a full copy of in.json - so no bytes were dropped.
Which leaves me with no clue about what to do with the return value of push.
What is the right thing to do?
The push() docs (https://nodejs.org/api/stream.html#stream_readable_push_chunk_encoding) say:
return Boolean Whether or not more pushes should be performed
The purpose is "backpressure": when push() returns false, there is probably no space left in an outbound buffer. If your stream were reading from somewhere like a file on disk, you would stop reading when push() returns false and wait to be called again. But in your case you are implementing _transform() rather than _read(), so you don't have much choice in the matter: you have received a chunk and should push() it. The Transform stream will buffer any excess internally, and it can delay future calls to your _transform() method.
So when you implement a Transform stream, you can safely ignore the return value of push().
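To illustrate the case where the return value does matter, here is a minimal sketch (not from the original answer) of a custom Readable whose _read() stops pushing when push() returns false and resumes when Node calls _read() again:

const { Readable } = require('stream');

class CounterStream extends Readable {
  constructor(max) {
    super();
    this.current = 0;
    this.max = max; // hypothetical upper bound for this example
  }
  _read() {
    // Keep pushing until push() returns false (internal buffer is full)
    // or we run out of data; Node calls _read() again when it wants more.
    while (this.current < this.max) {
      const ok = this.push(Buffer.from(String(this.current++) + '\n'));
      if (!ok) return; // respect backpressure: wait for the next _read()
    }
    this.push(null); // end of stream
  }
}

new CounterStream(1000).pipe(process.stdout);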
For more on this, see:
Node.js Readable Stream _read Usage
What's the proper way to handle back-pressure in a node.js Transform stream?

fs.createReadStream - limit the amount of data streamed at a time

If I only want to read 10 bytes at a time, or one line of data at a time (looking for newline characters), is it possible to pass fs.createReadStream() options like so
var options = {}
var stream = fs.createReadStream('file.txt', options);
so that I can limit the amount of data streamed at a time?
Looking at the fs docs, I don't see any options that would allow me to do that, even though I'm guessing it's possible.
https://nodejs.org/api/fs.html#fs_fs_createreadstream_path_options
You can use .read():
var stream = fs.createReadStream('file.txt', options);
var byteSize = 10;

stream.on("readable", function() {
  var chunk;
  while ((chunk = stream.read(byteSize))) {
    console.log(chunk.length);
  }
});
The main benefit of knowing about .read() over just the highWaterMark option is that you can call it on streams you haven't created yourself.
Here are the docs
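As a side note not covered by this answer: for the "one line at a time" part of the question, the built-in readline module is usually simpler than sizing read() calls. A minimal sketch:

const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('file.txt'),
  crlfDelay: Infinity // treat \r\n as a single line break
});

rl.on('line', (line) => {
  console.log('line:', line);
});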

Is it possible to register multiple listeners to a child process's stdout data event? [duplicate]

I need to run two commands in series that need to read data from the same stream.
After piping a stream into another, the buffer is emptied, so I can't read data from that stream again, and this doesn't work:
var spawn = require('child_process').spawn;
var fs = require('fs');
var request = require('request');

var inputStream = request('http://placehold.it/640x360');
var identify = spawn('identify', ['-']);

inputStream.pipe(identify.stdin);

var chunks = [];
identify.stdout.on('data', function(chunk) {
  chunks.push(chunk);
});
identify.stdout.on('end', function() {
  var size = getSize(Buffer.concat(chunks)); // width
  var convert = spawn('convert', ['-', '-scale', size * 0.5, 'png:-']);
  inputStream.pipe(convert.stdin);
  convert.stdout.pipe(fs.createWriteStream('half.png'));
});

function getSize(buffer) {
  return parseInt(buffer.toString().split(' ')[2].split('x')[0]);
}
Request complains about this
Error: You cannot pipe after data has been emitted from the response.
and changing the inputStream to fs.createWriteStream yields the same issue of course.
I don't want to write to a file; I want to reuse, in some way, the stream that request produces (or any other readable stream, for that matter).
Is there a way to reuse a readable stream once it finishes piping?
What would be the best way to accomplish something like the above example?
You have to create a duplicate of the stream by piping it into two streams. A simple duplicate can be made with a PassThrough stream; it simply passes the input through to the output.
const spawn = require('child_process').spawn;
const PassThrough = require('stream').PassThrough;

const a = spawn('echo', ['hi user']);
const b = new PassThrough();
const c = new PassThrough();

a.stdout.pipe(b);
a.stdout.pipe(c);

let count = 0;
b.on('data', function (chunk) {
  count += chunk.length;
});
b.on('end', function () {
  console.log(count);
  c.pipe(process.stdout);
});
Output:
8
hi user
The first answer only works if streams take roughly the same amount of time to process data. If one takes significantly longer, the faster one will request new data, consequently overwriting the data still being used by the slower one (I had this problem after trying to solve it using a duplicate stream).
The following pattern worked very well for me. It uses Streamz, a library based on Streams2, together with promises to synchronize async streams via a callback. Using the familiar example from the first answer:
var spawn = require('child_process').spawn;
var pass = require('stream').PassThrough;
var streamz = require('streamz').PassThrough;
var Promise = require('bluebird');

var a = spawn('echo', ['hi user']);
var b = new pass();
var c = new pass();

a.stdout.pipe(streamz(combineStreamOperations));

function combineStreamOperations(data, next) {
  Promise.join(b, c, function(b, c) { // perform n operations on the same data
    next(); // request more
  });
}

var count = 0;
b.on('data', function(chunk) { count += chunk.length; });
b.on('end', function() { console.log(count); c.pipe(process.stdout); });
You can use this small npm package I created:
readable-stream-clone
With this you can reuse readable streams as many times as you need.
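A usage sketch along the lines of the package README (file names are illustrative; check the README for the exact API):

const fs = require('fs');
const ReadableStreamClone = require('readable-stream-clone');

const readStream = fs.createReadStream('text.txt');

// Each clone can be consumed independently.
const clone1 = new ReadableStreamClone(readStream);
const clone2 = new ReadableStreamClone(readStream);

clone1.pipe(fs.createWriteStream('copy1.txt'));
clone2.pipe(fs.createWriteStream('copy2.txt'));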
For the general problem, the following code works fine:
var PassThrough = require('stream').PassThrough;

var a = new PassThrough();
var b1 = new PassThrough();
var b2 = new PassThrough();

a.pipe(b1);
a.pipe(b2);

b1.on('data', function(data) {
  console.log('b1:', data.toString());
});
b2.on('data', function(data) {
  console.log('b2:', data.toString());
});

a.write('text');
I have a different solution for writing to two streams simultaneously. Naturally, the time to write will be the sum of the two times, but I use it to respond to a download request where I want to keep a copy of the downloaded file on my server (actually I use an S3 backup, so I cache the most-used files locally to avoid multiple file transfers).
/**
 * A utility class made to write to a file while answering a file download request
 */
class TwoOutputStreams {
  constructor(streamOne, streamTwo) {
    this.streamOne = streamOne;
    this.streamTwo = streamTwo;
  }

  setHeader(header, value) {
    if (this.streamOne.setHeader)
      this.streamOne.setHeader(header, value);
    if (this.streamTwo.setHeader)
      this.streamTwo.setHeader(header, value);
  }

  write(chunk) {
    this.streamOne.write(chunk);
    this.streamTwo.write(chunk);
  }

  end() {
    this.streamOne.end();
    this.streamTwo.end();
  }
}
You can then use this as a regular OutputStream
const twoStreamsOut = new TwoOutputStreams(fileOut, responseStream)
and pass it to your method as if it were a response or a file output stream.
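A hedged sketch of how this might be wired up for the download-plus-local-copy case described above; sourceStream stands in for the S3 download stream and res for the HTTP response:

const fs = require('fs');

// Placeholders: `sourceStream` is the download source (e.g. an S3 read stream)
// and `res` is the HTTP response being answered.
const fileOut = fs.createWriteStream('/tmp/cached-copy');
const out = new TwoOutputStreams(fileOut, res);

out.setHeader('Content-Type', 'application/octet-stream');
sourceStream.on('data', chunk => out.write(chunk));
sourceStream.on('end', () => out.end());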
If you have async operations on the PassThrough streams, the answers posted here won't work.
A solution that works for async operations includes buffering the stream content and then creating streams from the buffered result.
To buffer the result you can use concat-stream
const Promise = require('bluebird');
const concat = require('concat-stream');

const getBuffer = function(stream) {
  return new Promise(function(resolve, reject) {
    var gotBuffer = function(buffer) {
      resolve(buffer);
    };
    var concatStream = concat(gotBuffer);
    stream.on('error', reject);
    stream.pipe(concatStream);
  });
};
To create streams from the buffer you can use:
const { Readable } = require('stream');

const getBufferStream = function(buffer) {
  const stream = new Readable();
  stream.push(buffer);
  stream.push(null);
  return Promise.resolve(stream);
};
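A hypothetical way to tie the two helpers together; sourceStream, consumerA, and consumerB stand for the source and the async operations mentioned above:

// Buffer the source once, then hand an independent stream to each async consumer.
getBuffer(sourceStream)
  .then(buffer => Promise.all([
    getBufferStream(buffer).then(streamA => consumerA(streamA)),
    getBufferStream(buffer).then(streamB => consumerB(streamB))
  ]));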
What about piping into two or more streams not at the same time?
For example:
var fs = require('fs');
var PassThrough = require('stream').PassThrough;

var mybinaryStream = stream.start(); // never-ending audio stream (pseudocode)
var file1 = fs.createWriteStream('file1.wav', {encoding: 'binary'});
var file2 = fs.createWriteStream('file2.wav', {encoding: 'binary'});
var mypass = new PassThrough();

mybinaryStream.pipe(mypass);
mypass.pipe(file1);

setTimeout(function() {
  mypass.pipe(file2);
}, 2000);
The above code does not produce any errors, but file2 is empty.
