Node: pipeline not blocking on paused passthrough

One of the basic behaviours of Node's streams is that writing to a paused stream blocks, and a stream that is not piped anywhere starts out paused.
In this example, the created PassThrough is not piped to anything during the event-loop iteration in which it is created. One would expect any pipeline run on this PassThrough to block until it is piped or a 'data' listener is attached, but this is not the case.
The pipeline calls back, but nothing is consumed.
const {promises: pFs} = require('fs');
const fs = require('fs');
const {PassThrough} = require('stream');
const {pipeline: pipelineCb} = require('stream');
const util = require('util');
const pipeline = util.promisify(pipelineCb);
const path = require('path');
const assert = require('assert');
/**
 * Create a stream that will asynchronously be piped to the final output file
 * @param {string} outputPath
 * @return {PassThrough}
 */
function myCreateWritableStream (outputPath) {
  // The stream is created in paused mode -> should block until piped
  const stream = new PassThrough();
  (async () => {
    // Do some stuff (create directory / check space / connect...)
    await new Promise(resolve => setTimeout(resolve, 500));
    console.log('piping passThrough to final output');
    // Consume the stream
    await pipeline(stream, fs.createWriteStream(outputPath));
    console.log('passThrough stream content written');
  })().catch(e => {
    console.error(e);
    stream.emit('error', e);
  });
  return stream;
}
/**
 * Main test function
 * @return {Promise<void>}
 */
async function main () {
  // Prepare the test directory with a 'tmp1' file only
  const smallFilePath = path.join(__dirname, 'tmp1');
  const smallFileOut = path.join(__dirname, 'tmp2');
  await Promise.all([
    pFs.writeFile(smallFilePath, 'a small content'),
    pFs.unlink(smallFileOut).catch(e => assert(e.code === 'ENOENT'))
  ]);
  // Duplicate the tmp1 file to tmp2
  await pipeline([
    fs.createReadStream(smallFilePath),
    myCreateWritableStream(smallFileOut)
  ]);
  console.log('pipeline ended');
  // Check content
  const finalContent = await pFs.readdir(__dirname);
  console.log('directory content');
  console.log(finalContent.filter(file => file.startsWith('tmp')));
}
main().catch(e => {
  process.exitCode = 1;
  console.error(e);
});
This code outputs the following lines:
pipeline ended
directory content
[ 'tmp1' ]
piping passThrough to final output
passThrough stream content written
If the pipeline really waited for the stream to end, the output would instead be:
piping passThrough to final output
passThrough stream content written
pipeline ended
directory content
[ 'tmp1', 'tmp2' ]
How can you explain this behaviour?

I don't think the API gives the guarantees you are looking for here.
stream.pipeline calls its callback after all data has finished writing. Since the data has been written to a new Transform stream (your PassThrough), and that stream has nowhere to put the data yet, it simply gets stored in the stream's internal buffer. That is good enough for the pipeline.
If you were to read a large enough file, filling the Transform stream's buffer, backpressure would automatically trigger a pause() on the readable that is reading the file. Once the Transform stream drains, it automatically resumes the readable so data flow continues.
I think your example makes two incorrect assumptions:
(1) That you can pause a transform stream. According to the stream docs, pausing any stream that is piped to a destination is ineffective, because it will immediately resume itself as soon as a piped destination asks for more data. Also, a paused transform stream still accepts incoming data; pausing only stops it from emitting data downstream.
(2) That a pause further down a pipeline somehow propagates up to the front of the pipeline and causes data to stop flowing. This is only true if it is caused by backpressure, meaning you would need to trigger Node's detection of a full internal buffer.
When working with pipes, it's best to assume you have manual control over the two farthest ends, but not necessarily of any of the pieces in the middle. (You can manually pipe() and unpipe() to connect and disconnect intermediate streams, but you can't pause them.)
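Both points can be seen in isolation with a few writes to an unconsumed PassThrough. This is a minimal sketch, not taken from the question's code; the deliberately tiny highWaterMark just makes the threshold easy to hit:
const { PassThrough } = require('stream');

// A PassThrough with a tiny buffer so backpressure is easy to trigger
const pass = new PassThrough({ highWaterMark: 16 });

// Nothing consumes `pass`, yet writes are accepted and simply buffered:
console.log(pass.write('0123456789')); // true  -> still below the highWaterMark
console.log(pass.write('0123456789')); // true  -> the internal buffer is now full
console.log(pass.write('0123456789')); // false -> backpressure: a pipe()d source would pause here
console.log(pass.readableLength);      // 20 -> bytes parked in the readable-side buffer

// Only when a consumer appears does the buffer drain; 'drain' tells the producer to resume.
pass.once('drain', () => console.log('drain: the source may resume'));
pass.pipe(process.stdout);
Until the buffers fill, write() keeps returning true, which is exactly why a small file "passes through" a pipeline ending in an unconsumed PassThrough without anything blocking.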

Related

How do I stream a chunked file using Node.js Readable?

I have a 400Mb file split into chunks that are ~1Mb each.
Each chunk is a MongoDB document:
{
  name: 'stuff.zip',
  index: 15,
  buffer: Binary('......'),
  totalChunks: 400
}
I am fetching each chunk from my database and then streaming it to the client.
Every time I get a chunk from the DB, I push it to the readableStream, which is being piped to the client.
Here is the code:
import { Readable } from 'stream'

const name = 'stuff.zip'
const contentType = 'application/zip'

app.get('/api/download-stuff', async (req, res) => {
  res.set('Content-Type', contentType)
  res.set('Content-Disposition', `attachment; filename=${name}`)
  res.attachment(name)

  // get `totalChunks` from any chunk
  let { totalChunks } = await ChunkModel.findOne({ name }).select('totalChunks')
  let index = 0

  const readableStream = new Readable({
    async read() {
      if (index < totalChunks) {
        let { buffer } = await ChunkModel.findOne({ name, index }).select('buffer')
        let canContinue = readableStream.push(buffer)
        console.log(`pushed chunk ${index}/${totalChunks}`)
        index++
        // sometimes it logs false
        // which means I should be waiting before pushing more
        // but I don't know how
        console.log('canContinue = ', canContinue)
      } else {
        readableStream.push(null)
        readableStream.destroy()
        console.log(`all ${totalChunks} chunks streamed to the client`)
      }
    }
  })

  readableStream.pipe(res)
})
The code works.
But I'm wondering whether I risk overflowing my server's memory, especially when there are many concurrent requests for the same file or the file has many chunks.
Question: my code is not waiting for readableStream to finish reading the chunk that was just pushed to it before pushing the next one. I thought it was, which is why I'm using read(){..} in this (probably wrong) way. So how should I wait for each chunk to be pushed, read, streamed to the client and cleared from my server's memory before I push the next one?
I have created this sandbox in case it helps anyone
In general, when the readable interface is implemented correctly (i.e., the backpressure signal is respected), it will prevent the code from overflowing memory regardless of the source size.
When implemented according to the API spec, the readable itself does not keep references to data that has finished passing through the stream. The memory requirement of a readable's buffer is adjusted by specifying a highWaterMark.
In this case, the snippet does not conform to the readable interface. It violates the following two concepts:
No data shall be pushed to the readable's buffer unless read() has been called. Currently, the implementation proceeds to push data from the DB immediately. Consequently, the readable buffer will start to fill before the sink has begun to consume data.
The readable's push() method returns a boolean flag. When the flag is false, the implementation must wait for .read() to be called before pushing additional data. If the flag is ignored, the buffer will overflow with respect to the highWaterMark.
Note that ignoring these core criteria of Readables circumvents the backpressure logic.
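For illustration, a minimal read() implementation that honours both rules, by performing one DB round trip and a single push per read() call, might look like this sketch (it reuses ChunkModel, name, totalChunks and res from the question):
let index = 0
const readableStream = new Readable({
  // read() is only called while the internal buffer is below its highWaterMark,
  // so fetching and pushing exactly one chunk per call is enough to respect
  // backpressure; the boolean returned by push() never needs special handling here.
  async read() {
    if (index >= totalChunks) {
      this.push(null) // signal end-of-stream
      return
    }
    try {
      const { buffer } = await ChunkModel.findOne({ name, index }).select('buffer')
      index++
      this.push(buffer)
    } catch (err) {
      this.destroy(err) // surface DB errors to whatever is consuming the stream
    }
  }
})
readableStream.pipe(res)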
An alternative implementation, if this is a Mongoose query:
const stream = require('stream');

app.get('/api/download-stuff', async (req, res) => {
  // ... truncated handler: `chunksLength` is resolved in the omitted part
  // A PassThrough that relays data from the cursor pipeline to the response
  const passThrough = new stream.PassThrough({ objectMode: false });
  // Pipe data using pipeline() to simplify handling stream errors
  stream.pipeline(
    // Create a cursor that fetches all relevant documents using a single query
    ChunkModel.find({ name }).limit(chunksLength).select("buffer").sort({ index: 1 }).lean().cursor(),
    // Cherry-pick the `buffer` property
    new stream.Transform({
      objectMode: true,
      transform: ({ buffer }, encoding, next) => {
        next(null, buffer);
      }
    }),
    // Write the retrieved documents to the PassThrough
    passThrough,
    error => {
      if (error) {
        // Log and handle the error. At this point the HTTP headers have probably
        // already been sent, so it is too late to return a 500.
      }
    }
  );
  // Stream the relayed data to the client
  passThrough.pipe(res);
});
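The appeal of this approach is that Mongoose's .cursor() returns a readable stream that fetches documents lazily as they are read, so the response's backpressure propagates back to the database query and only a bounded number of chunks sits in memory at any moment.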

NodeJS Stream flushed during the Event Loop iteration

I'm trying to pipe one Axios response stream into multiple files. It's not working, and I can reproduce the problem with the simple code below:
Will work:
const { PassThrough } = require('stream')
const inputStream = new PassThrough()
inputStream.write('foo')
// Now I have a stream with content
inputStream.pipe(process.stdout)
inputStream.pipe(process.stderr)
// will print 'foofoo', for both stdout and stderr
Will not work:
const { PassThrough } = require('stream')
const inputStream = new PassThrough()
inputStream.write('foo')
inputStream.pipe(process.stdout)
setImmediate(() => {
  inputStream.pipe(process.stderr)
})
// Will print only 'foo'
The question is: can I say that content already in the stream will be piped only if the two pipe calls execute in the same event-loop iteration?
Doesn't that make the situation non-deterministic?
By the time the callback scheduled with setImmediate is executed, the stream data has already been flushed. This can be checked via the stream's .readableLength property.
You can use cork and uncork in order to control when the buffered stream data is flushed.
const { PassThrough } = require('stream')
const inputStream = new PassThrough()
inputStream.cork()
inputStream.write('foo')
inputStream.pipe(process.stdout)
setImmediate(() => {
  inputStream.pipe(process.stderr)
  inputStream.uncork()
})
})
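To see the first point directly, logging readableLength on both sides of the setImmediate boundary (a small check against the snippet above) shows that the buffered 'foo' is already gone by the time the callback runs:
const { PassThrough } = require('stream')
const inputStream = new PassThrough()

inputStream.write('foo')
console.log(inputStream.readableLength) // 3 -> 'foo' is still sitting in the buffer

inputStream.pipe(process.stdout) // the flush is scheduled within this iteration

setImmediate(() => {
  console.log(inputStream.readableLength) // 0 -> already flushed to stdout
  inputStream.pipe(process.stderr) // nothing buffered is left to replay here
})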

Can I avoid creating a file with writeFileStream?

I create a writeFileStream and pipe a readableStream into it.
On 'data', I check the length of the data; if it is too short, I don't want writeFileStream to create a file.
Can I abort creating the file with writeFileStream, or unlink the file after it has been created?
Thanks for your help.
const fs = require('fs')
const { ReadableMock } = require('stream-mock')
const { assert } = require('chai')

describe.only('fs', () => {
  const expectedPath = './file.txt'
  const input = 'abc'
  const reader = new ReadableMock(input)
  const writer = fs.createWriteStream(expectedPath)

  before((done) => {
    let index = 0
    reader.pipe(writer)
    reader.on('data', () => {
      index++
      if (index === 1) {
        reader.unpipe(writer)
        done()
      }
    })
  })

  after(() => {
    fs.unlinkSync('./file.txt')
  })

  it('should not create file', () => {
    assert.isFalse(fs.existsSync(expectedPath)) // expected true to be false.
  })
})
To achieve what you're after, I'd create a PassThrough stream and use its highWaterMark to tell me when the stream has been filled. You won't need much code, and the streams add so little overhead you won't notice it (not compared with writing to disk or reading from HTTP). ;)
Here's what I'd do:
const { PassThrough } = require('stream')

const reader = new ReadableMock(input)
const checker = new PassThrough({
  highWaterMark: 4096 // or however many bytes you need to gather first
});

reader
  .once('pause', () => checker.pipe(fs.createWriteStream(expectedPath)))
  .pipe(checker);
What happens here is:
reader is piped to checker, which is not connected to anything but allows its highWaterMark worth of bytes (you may add an encoding there to count chars instead of bytes)
checker is paused, but on pipe reader will resume and try to write as much as it can
checker will accept some data before returning false from write, which makes reader pause (emitting its 'pause' event)
only now does the listener create the writer and its underlying file, and pipe checker into it
checker gets resumed, and so does reader
If the number of bytes is lower than the highWaterMark, 'pause' will not be emitted on reader and so the file won't get created.
Mind you, you may need to close connections and clean up if this is not a mock; otherwise you may leave those hanging, waiting to be read, and soon you'll exhaust incoming connection limits or available memory.
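If the file may already exist by the time you know the input is too short, the other route from the question, unlinking it afterwards, also works. A rough sketch reusing reader and expectedPath from the test, with an illustrative 4096-byte threshold:
const fs = require('fs')

const writer = fs.createWriteStream(expectedPath)
reader.pipe(writer)

writer.on('close', () => {
  // Everything is on disk and the descriptor is closed; drop the file again
  // if it turned out to be shorter than the threshold.
  if (writer.bytesWritten < 4096) {
    fs.unlink(expectedPath, err => {
      if (err) console.error('could not remove short file', err)
    })
  }
})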

How to keep the request open to use the write() method after a long time

I need to keep the connection open so that, after the current song finishes, I can write the next one. The problem is that, the way I did it, the stream simply stops after the first song.
How can I keep the connection open and play the next songs too?
const fs = require('fs');
const express = require('express');
const app = express();
const server = require('http').createServer(app)
const getMP3Duration = require('get-mp3-duration')

let sounds = ['61880.mp3', '62026.mp3', '62041.mp3', '62090.mp3', '62257.mp3', '60763.mp3']

app.get('/current', async (req, res) => {
  let readStream = fs.createReadStream('sounds/61068.mp3')
  let duration = await getMP3Duration(fs.readFileSync('sounds/61068.mp3'))
  let pipe = readStream.pipe(res, {end: false})

  async function put(){
    let file_path = 'sounds/'+sounds[Math.random() * sounds.length-1]
    duration = await getMP3Duration(fs.readFileSync(file_path))
    readStream = fs.createReadStream(file_path)
    readStream.on('data', chunk => {
      console.log(chunk)
      pipe.write(chunk)
    })
    console.log('Current Sound: ', file_path)
    setTimeout(put, duration)
  }

  setTimeout(put, duration)
})

server.listen(3005, async function () {
  console.log('Server is running on port 3005...')
});
You should use a library or look at the source code and see what they do.
A good one is:
https://github.com/obastemur/mediaserver
TIP:
Always start your research by learning from other projects (when possible, or when you are not reinventing the wheel ;)). You are not the first to do this or to hit this problem :)
A quick search for the phrase "nodejs stream mp3 github" gave me a few directions.
Good luck!
Express works by returning a single response to a single request. As soon as the request has been sent, a new request needs to be generated to trigger a new response.
In your case however you want to keep on generating new responses out of a single request.
Two approaches can be used to solve your problem:
Change the way you create your response to satisfy your use-case.
Use a real-time communication framework (WebSocket). The best and simplest that comes to mind is socket.io.
Adapting express
The solution here is to follow this procedure:
Request on endpoint /current comes in
The audio sequence is prepared
The stream of the entire sequence is returned
So your handler would look like this:
const fs = require('fs');
const express = require('express');
const app = express();
const server = require('http').createServer(app);
// Import the PassThrough class to concatenate the streams
const { PassThrough } = require('stream');

// The array of sounds now contains all the sounds
const sounds = ['61068.mp3', '61880.mp3', '62026.mp3', '62041.mp3', '62090.mp3', '62257.mp3', '60763.mp3'];

// Function which concatenates an array of streams
const concatStreams = streamArray => {
  let pass = new PassThrough();
  let waiting = streamArray.length;
  streamArray.forEach(soundStream => {
    pass = soundStream.pipe(pass, {end: false});
    soundStream.once('end', () => --waiting === 0 && pass.emit('end'));
  });
  return pass;
};

// Function which returns a shuffled array
const shuffle = (array) => {
  const a = [...array]; // shallow copy of the array
  for (let i = a.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [a[i], a[j]] = [a[j], a[i]];
  }
  return a;
};

app.get('/current', (req, res) => {
  // Start by shuffling the array
  const shuffledSounds = shuffle(sounds);
  // Create a readable stream for each sound
  const streams = shuffledSounds.map(sound => fs.createReadStream(`sounds/${sound}`));
  // Concatenate all the streams into a single stream
  const readStream = concatStreams(streams);
  // Pipe the concatenated stream to the response object (which goes to the client).
  // The response is ended automatically when the stream emits its 'end' event.
  readStream.pipe(res);
});
Notice that the function does not require the async keyword any longer. The process is still asynchronous but the coding is emitter based instead of promise based.
If you want to loop the sounds, you can add further rounds of shuffling, mapping to streams and concatenation, as sketched below.
I did not include the socket.io alternative, to keep things simple.
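For example, one way to loop (a sketch; /loop is an illustrative route, and shuffle, sounds and concatStreams are reused from above) is to keep the response open with {end: false} and start a new shuffled round whenever the previous one ends:
const playRound = (res) => {
  const shuffledSounds = shuffle(sounds);
  const streams = shuffledSounds.map(sound => fs.createReadStream(`sounds/${sound}`));
  const round = concatStreams(streams);
  // Keep the response open so the next round can be appended after this one
  round.pipe(res, { end: false });
  // concatStreams() emits 'end' once every sound in the round has finished
  round.once('end', () => playRound(res));
};

app.get('/loop', (req, res) => playRound(res));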
Final Solution After a Few Edits:
I suspect your main issue is with your random array element generator. You need to wrap what you have with Math.floor to round down to ensure you end up with a whole number:
sounds[Math.floor(Math.random() * sounds.length)]
Also, readStream.pipe returns the destination, so what you're doing makes sense. However, you might get unexpected results from calling on('data') on your readable after you've already piped from it. The node.js streams docs mention this. I tested your code on my local machine and it doesn't seem to be an issue, but it might make sense to change this so you don't have problems in the future.
Choose One API Style
The Readable stream API evolved across multiple Node.js versions and provides multiple methods of consuming stream data. In general, developers should choose one of the methods of consuming data and should never use multiple methods to consume data from a single stream. Specifically, using a combination of on('data'), on('readable'), pipe(), or async iterators could lead to unintuitive behavior.
Instead of calling on('data') and res.write, I would just pipe from the readStream into res again. Also, unless you really want the duration, I would pull that library out and just use the read stream's 'end' event to make the next call to put(). This works because you're passing {end: false} when piping, which disables the default behaviour of ending the write stream and leaves it open. The 'end' event still gets emitted on the readable, though, so you can use it as a marker for when the readable has finished piping. Here's the refactored code:
const fs = require('fs');
const express = require('express');
const app = express();
const server = require('http').createServer(app)
//const getMP3Duration = require('get-mp3-duration') no longer needed

let sounds = ['61880.mp3', '62026.mp3', '62041.mp3', '62090.mp3', '62257.mp3', '60763.mp3']

app.get('/current', (req, res) => {
  let readStream = fs.createReadStream('sounds/61068.mp3')
  let pipe = readStream.pipe(res, {end: false})

  function put(){
    let file_path = 'sounds/'+sounds[Math.floor(Math.random() * sounds.length)]
    readStream = fs.createReadStream(file_path)
    // you may also be able to do readStream.pipe(res, {end: false})
    readStream.pipe(pipe, {end: false})
    console.log('Current Sound: ', file_path)
    readStream.on('end', () => {
      put()
    });
  }

  readStream.on('end', () => {
    put()
  });
})

server.listen(3005, function () {
  console.log('Server is running on port 3005...')
});

Proper way to consume NodeJS stream into buffer and write stream

I have a need to pipe a readable stream into both a buffer (to be converted into a string) and a file. The stream is coming from node-fetch.
NodeJS streams have two states: paused and flowing. From what I understand, as soon as a 'data' listener is attached, the stream will change to flowing mode. I want to make sure the way I am reading a stream will not lose any bytes.
Method 1: piping and reading from 'data':
fetch(url).then(
  response =>
    new Promise(resolve => {
      const buffers = []
      const dest = fs.createWriteStream(filename)
      response.body.pipe(dest)
      response.body.on('data', chunk => buffers.push(chunk))
      dest.on('close', () => resolve(Buffer.concat(buffers).toString()))
    })
)
Method 2: using passthrough streams:
const { PassThrough } = require('stream')

fetch(url).then(
  response =>
    new Promise(resolve => {
      const buffers = []
      const dest = fs.createWriteStream(filename)
      const forFile = new PassThrough()
      const forBuffer = new PassThrough()
      response.body.pipe(forFile).pipe(dest)
      response.body.pipe(forBuffer)
      forBuffer.on('data', chunk => buffers.push(chunk))
      dest.on('close', () => resolve(Buffer.concat(buffers).toString()))
    })
)
Is the second method required so there is no lost data? Is the second method wasteful since two more streams could be buffered? Or, is there another way to fill a buffer and write stream simultaneously?
You won't miss any data, since .pipe internally attaches a 'data' listener on the source and writes every chunk to the target stream.
So any chunk written to your dest stream will also be emitted to your response.body.on('data') listener, where you're buffering the chunks.
In any case, you should listen for 'error' events and reject if any error occurs.
And while your second method will work, you don't need it.
This is a chunk of code from the .pipe function
src.on('data', ondata);
function ondata(chunk) {
  debug('ondata');
  var ret = dest.write(chunk);
  debug('dest.write', ret);
  if (ret === false) {
    // If the user unpiped during `dest.write()`, it is possible
    // to get stuck in a permanently paused state if that write
    // also returned false.
    // => Check whether `dest` is still a piping destination.
    if (((state.pipesCount === 1 && state.pipes === dest) ||
         (state.pipesCount > 1 && state.pipes.indexOf(dest) !== -1)) &&
        !cleanedUp) {
      debug('false write response, pause', state.awaitDrain);
      state.awaitDrain++;
    }
    src.pause();
  }
}
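For completeness, here is the first method again with that error handling wired in (a sketch in the same shape and with the same names as the question's snippet; reject is attached to the 'error' events of both the response body and the file stream):
fetch(url).then(
  response =>
    new Promise((resolve, reject) => {
      const buffers = []
      const dest = fs.createWriteStream(filename)
      response.body.pipe(dest)
      response.body.on('data', chunk => buffers.push(chunk))
      response.body.on('error', reject) // network / body errors
      dest.on('error', reject)          // filesystem errors
      dest.on('close', () => resolve(Buffer.concat(buffers).toString()))
    })
)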
