Transform stream stops processing early - node.js

I'm trying to extend the Node.js Transform stream twice: once to split stdin into lines, and once to alternate each line between lowercase and uppercase.
SplitLines is working as intended, but AlternateUppercase is not.
index.js
const { Transform } = require('stream')

class SplitLines extends Transform {
  _transform(chunk, encoding, callback) {
    const parsed = chunk.toString().trim()
    const results = parsed.split('\n')
    results.forEach((line) => {
      this.push(`${line}\n`)
    })
  }
}

class AlternateUppercase extends Transform {
  constructor(options) {
    super(options)
    this.isEven = false
  }

  _transform(chunk, encoding, callback) {
    const line = chunk.toString()
    const altered = this.isEven ? line.toUpperCase() : line
    this.push(`${altered}\n`)
    this.isEven = !this.isEven
  }
}

process.stdin
  .pipe(new SplitLines())
  .pipe(new AlternateUppercase())
  .pipe(process.stdout)
How I'm testing
echo -e 'one\ntwo\nthree' | node index.js
What I'm seeing in the terminal
one
What I would expect to see
one
TWO
three
Am I doing something wrong which is causing the stream to stop processing early?

In the Node documentation, you are required to call the callback function in order to receive the next chunk. This is not so apparent in the SplitLines function, because the whole string arrives there as a single chunk. However, since SplitLines pushes repeatedly, it sends multiple chunks downstream, so AlternateUppercase must call the callback to receive each subsequent chunk.
const { Transform } = require('stream')

class SplitLines extends Transform {
  _transform(chunk, encoding, callback) {
    const parsed = chunk.toString().trim()
    const results = parsed.split('\n')
    results.forEach((line) => {
      this.push(`${line}\n`)
    })
    callback() // call the callback here too, so any further stdin chunks are processed
  }
}

class AlternateUppercase extends Transform {
  constructor(options) {
    super(options)
    this.isEven = false
  }

  _transform(chunk, encoding, callback) {
    const line = chunk.toString()
    const altered = this.isEven ? line.toUpperCase() : line
    this.push(`${altered}\n`)
    callback() // must call the callback to receive the next chunk
    this.isEven = !this.isEven
  }
}

process.stdin
  .pipe(new SplitLines())
  .pipe(new AlternateUppercase())
  .pipe(process.stdout)
The output would be:
one

TWO

three

The extra blank lines appear because you append \n in both transforms. To fix this, add the \n only once, i.e. in either SplitLines or AlternateUppercase, but not in both.
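For example, a minimal sketch of AlternateUppercase with the duplicate \n removed (SplitLines already terminates each line):

class AlternateUppercase extends Transform {
  constructor(options) {
    super(options)
    this.isEven = false
  }

  _transform(chunk, encoding, callback) {
    const line = chunk.toString()
    // SplitLines already appended '\n', so push the line as-is
    this.push(this.isEven ? line.toUpperCase() : line)
    this.isEven = !this.isEven
    callback()
  }
}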

Related

Node.js - how to process readStream for pipeline

Problem
I'm trying to recursively walk a directory and write its paths to a file using the readdirp module and Node's pipeline.
const fs = require('fs')
const readdirp = require('readdirp')
const zlib = require('zlib')
const { pipeline } = require('stream')

let w_stream = fs.createWriteStream('C:/test/paths.txt')
let r_stream = readdirp('C:/Program Files')

pipeline(
  r_stream,
  zlib.createGzip(),
  w_stream,
  (error) => {
    if (error) {
      console.error('Pipeline failed', error)
    } else {
      console.log('Pipeline succeeded')
    }
  }
)
The r_stream outputs objects like {fullPath: String, stats: Object} but the pipe expects strings. So, I get the following error:
The "chunk" argument must be of type string or an instance of Buffer or Uint8Array. Received an instance of Object
Question
How do I process r_stream data chunks so it returns entry.fullPath (strings) instead of entry (object) into the pipeline? I tried adding the following, but I get the same error:
...
r_stream.on('data', (entry) => {
w_stream.write(`${entry.fullPath}\n`)
})
pipeline(
...
How do I properly return the entry.fullPath string from r_stream.on('data'... into the pipeline?
Found the solution myself. You need to use a stream transform for this:
const fs = require('fs')
const through2 = require('through2')
const readdirp = require('readdirp')
const zlib = require('zlib')

let w_stream = fs.createWriteStream('C:/test/paths.txt')
let r_stream = readdirp('C:/Program Files')

// note: do not use an arrow function here, otherwise 'this.push()' will be undefined
let lineTransformer = through2.obj(function (data, enc, next) {
  this.push(`${data.fullPath}\n`)
  next()
})

r_stream
  .pipe(lineTransformer)
  .pipe(zlib.createGzip())
  .pipe(w_stream)
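For comparison, a minimal sketch of the same mapping using Node's built-in Transform in object mode, which avoids the through2 dependency (assuming the same r_stream and w_stream as above):

const { Transform } = require('stream')

const lineTransformer = new Transform({
  writableObjectMode: true, // accept readdirp's entry objects
  transform(entry, encoding, callback) {
    // emit one line of text per entry
    callback(null, `${entry.fullPath}\n`)
  }
})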

Simple buffering duplex stream in Node - how to?

I'm trying to implement a duplex stream with buffering capabilities.
It should accumulate chunks of data until there are enough of them, and only then send them further.
It can be used, for example, when playing streamed audio/video data: one doesn't simply get frames in time, right?
Below is my silly attempt to create such a buffering duplex stream. There's a source stream, which sends x\n characters to the buffering stream, which in its turn should send data further to process.stdout.
Alas, it doesn't work. Specifically, the read() function doesn't seem to have any way to pause or stop, as in:
"Hey, I don't have any data for you now, come back later."
Once I return undefined or null, the story ends and nothing comes out on stdout.
var { Readable, Duplex } = require('stream');

// Source stream, seeds: x\n, x\n, x\n, ...
let c = 10;
var rs = new Readable({
  read () {
    if (c > 0) {
      c--;
      console.log('rs reading:', 'x');
      this.push('x\n');
    } else {
      this.push(null)
    }
  },
});

// Buffering duplex stream
// I want it to cache 3 items and only then to proceed
const queue = [];
const limit = 3;

var ds = new Duplex({
  writableHighWaterMark: 0,
  write (chunk, encoding, callback) {
    console.log('ds writing:', chunk, 'paused: ', ds.isPaused());
    queue.push(chunk);
    callback();
  },

  readableHighWaterMark: 0,
  read () {
    // We don't want to output anything
    // until there's enough elements in the `queue`.
    if (queue.length >= limit) {
      const chunk = queue.shift();
      console.log('ds reading:', chunk);
      this.push(chunk);
    } else {
      // So how to wait here?
      this.push(undefined)
    }
  },
});
// PROBLEM: nothing is coming out of the "ds" and printed on the stdout
rs.pipe(ds).pipe(process.stdout);
Here is my repl: https://repl.it/#OnkelTem/BufferingStream1-1#index.js
I checked the state of the duplex and it's not even in the paused state. So it's not paused, it's flowing, and yet it returns nothing.
I also spent a couple of hours re-reading the documentation on Node streams, but it doesn't actually feel like it was written to be understood.
A buffering stream is just a kind of transform stream. If I understand correctly what you're trying to do, the implementation shouldn't be any more complicated than this:
const { Transform } = require('stream');

const DEFAULT_CAPACITY = 10;

class BufferingTransform extends Transform {

  constructor(options = {}) {
    super(options);
    this.capacity = options.capacity || DEFAULT_CAPACITY;
    this.pending = [];
    return;
  }

  get atCapacity() {
    return this.pending.length >= this.capacity;
  }

  _transform(chunk, encoding, cb) {
    if (this.atCapacity) {
      this.push( ...this.pending.shift() );
    }
    this.pending.push( [chunk, encoding] );
    cb();
  }

  _flush(cb) {
    while (this.pending.length > 0) {
      this.push( ...this.pending.shift() );
    }
    cb();
  }

}
Once you have that, it should just be a matter of piping your source through the BufferingTransform and reading from it:
async function readFromSource() {
  const source = openSourceForReading();
  const buffer = new BufferingTransform();

  source.pipe(buffer);

  for await (const chunk of buffer) {
    console.log(chunk);
  }
}
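Applied back to the original example, the pipeline from the question would then look roughly like this (a sketch, assuming the rs readable from the question and the capacity option read by the constructor above):

rs
  .pipe(new BufferingTransform({ capacity: 3 }))
  .pipe(process.stdout)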
Here's an implementation using async iterables:
function bufferStream(stream, bufferCount) {
  stream = normalizeAsyncIterable(stream);
  const iterator = stream[Symbol.asyncIterator]();
  const queue = []

  while (queue.length < bufferCount)
    queue.push(iterator.next());

  return normalizeAsyncIterable({
    [Symbol.asyncIterator]: () => ({
      next: async () => {
        const promise = queue.shift() ?? iterator.next();

        while (queue.length < bufferCount)
          queue.push(iterator.next());

        return promise;
      }
    })
  });
}

// Ensures that calls to .next() while the generator is paused are handled correctly
async function* normalizeAsyncIterable(iterable) {
  for await (const value of iterable)
    yield value;
}
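A hypothetical usage sketch, relying on the fact that Node readable streams are async iterable (the consume and source names are illustrative):

async function consume(source) {
  // keep up to 3 chunks in flight ahead of the consumer
  for await (const chunk of bufferStream(source, 3)) {
    process.stdout.write(String(chunk));
  }
}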

Using csv-parse with highlandjs

I would like to do a bit of parsing on CSV files to convert them to JSON and extract data out of them. I'm using highland as a stream processing library. I am creating an array of CSV parsing streams using:
import { readdir as readdirCb, createReadStream } from 'fs';
import { promisify } from 'util';
import _ from 'highland';
import parse from 'csv-parse';

const readdir = promisify(readdirCb);

const LOGS_DIR = './logs';
const options = '-maxdepth 1';

async function main() {
  const files = await readdir(LOGS_DIR)
  const stream = _(files)
    .map(filename => createReadStream(`${LOGS_DIR}/${filename}`))
    .map(parse)
}

main();
main();
I have tried to use the stream like this:
const stream = _(files)
  .map(filename => createReadStream(`${LOGS_DIR}/${filename}`))
  .map(parse)
  .each(stream => {
    stream.on('parseable', () => {
      let record
      while (record = stream.read()) { console.log(record) }
    })
  })
This does not produce any records. I am not sure how to proceed in order to receive the JSON for each row of each CSV file.
EDIT:
Writing a function like this works for an individual file:
import parse from 'csv-parse';
import transform from 'stream-transform';
import { createReadStream } from 'fs';
export default function retrieveApplicationIds(filename) {
console.log('Parsing file', filename);
return createReadStream(filename).pipe(parser).pipe(getApplicationId).pipe(recordUniqueId);
}
Edit 2:
I have tried using the concat streams approach:
const LOGS_DIR = './logs';
function concatStreams(streamArray, streamCounter = streamArray.length) {
  return streamArray.reduce((mergedStream, stream) => {
    // pipe each stream of the array into the merged stream
    // prevent the automated 'end' event from firing
    mergedStream = stream.pipe(mergedStream, { end: false });
    // rewrite the 'end' event handler
    // Every time one of the streams ends, the counter is decremented.
    // Once the counter reaches 0, the mergedStream can emit its 'end' event.
    stream.once('end', () => --streamCounter === 0 && mergedStream.emit('end'));
    return mergedStream;
  }, new PassThrough());
}

async function main() {
  const files = await readdir(LOGS_DIR)
  const streams = files.map(parseFile);
  const combinedStream = concatStreams(streams);
  combinedStream.pipe(process.stdout);
}

main();
When I use this, I get the error:
(node:1050) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 unpipe listeners added to [Transformer]. Use emitter.setMaxListeners() to increase limit
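A minimal sketch of one way to avoid merging streams altogether: drop highland here and consume each csv-parse parser sequentially with async iteration (this assumes the csv-parse default export and the LOGS_DIR constant from the snippets above):

import { createReadStream } from 'fs';
import parse from 'csv-parse';

async function readAllRecords(files) {
  const records = [];
  for (const filename of files) {
    // each parser is a Transform stream, and Node streams are async iterable
    const parser = createReadStream(`${LOGS_DIR}/${filename}`).pipe(parse());
    for await (const record of parser) {
      records.push(record);
    }
  }
  return records;
}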

How to use readline in NodeJs on an event (wait stream close)

I'm creating a bot that, when it receives a message, starts reading a text file and replies to the message with the contents of the file.
Unfortunately, I can't get out of this asynchronous hell and I only get errors, undefined, or a pending Promise.
The last experiment was this:
const fs = require('fs');
const readline = require('readline');
// bot.listen("message").reply(responseText())
function readFile(file) {
  var text = '';
  var readInterface = readline.createInterface({
    input: fs.createReadStream(file),
    terminal: false
  });
  readInterface.on('line', function (line) {
    linea = line.trim();
    console.log(linea);
    text += linea;
  }).on('close', function () {
    return text;
  });
}

async function responseText() {
  var content = await readFile("file.txt");
  content.then(function (data) {
    return data;
  })
}
What I would like, then, is to delay the response until I have the contents of the file.
I know that node is based on async but I can't figure out how to handle it!
Thanks all
If you want to use async/await, you need to create a promise and return it.
function readFile(file) {
  return new Promise((res, rej) => {
    try {
      var text = '';
      var readInterface = readline.createInterface({
        input: fs.createReadStream(file),
        terminal: false
      });
      readInterface
        .on('line', function (line) {
          const linea = line.trim();
          text += linea;
        })
        .on('close', function () {
          res(text);
        });
    } catch (err) {
      rej(err)
    }
  });
}
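With readFile now returning a Promise, the responseText function from the question only needs to await it, for example:

async function responseText() {
  const content = await readFile('file.txt');
  return content;
}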
If you're using Express.js or any framework built on top of it, you can simply pipe the read stream to the response, since Express's responses are streams to begin with:
const es = require('event-stream')
...

let getFileStream = path => (
  fs.createReadStream(path)
    .pipe(es.split())
    .pipe(es.map(function (data, cb) {
      cb(null, inspect(JSON.parse(data)))
    }))
);

router.get('/message', function (req, res, next) {
  let file$ = getFileStream(yourFilePath)
  file$.on('error', next).pipe(res)
})
If you need to transform the file content, you can use a transform stream or, as shown in the example above, a synchronous event-stream mapping. The idea is to always work with the file content at the stream level, to avoid loading the entire file into memory.
You don't really want to buffer the whole file content in memory; it can quickly become a problem with huge files on a busy day. What you need is to pipe the file stream directly to the browser. The same principle applies to any kind of consumer.
Of course, if the mechanism is all internal, you should only pass the file path along, or the actual stream, until you need to actually open the file and do something with its content. In that case, you go back to your stream toolbox, whether it be the native Node.js stream API, the event-stream package, or some kind of observable library like RxJS.
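When no per-line transformation is needed at all, the direct pipe mentioned above looks roughly like this minimal sketch (assuming the same router and yourFilePath as in the snippet above, and fs required):

router.get('/message', function (req, res, next) {
  fs.createReadStream(yourFilePath)
    .on('error', next)
    .pipe(res)
})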
I had a similar issue in an app that watches a directory for new files, reads the file(s) and returns derived data based on the file content. My Reader function is based on this async example from the nodejs docs. I return options, which contains the context, only after the file is read completely.
const { createReadStream } = require('fs')
const { createInterface } = require('readline')
const { once } = require('events')

// Reader.js
async function Reader (options) {
  let { file, cb } = options
  let fileStream = createReadStream(file)

  const readInterface = createInterface({
    input: fileStream,
    crlfDelay: Infinity
  })

  readInterface.on('line', (line) => {
    cb(line)
  })

  await once(readInterface, 'close')

  return options
}

module.exports = Reader
I then have a file which imports my Reader and defines how to use it. I define a callback function to pass to the line event listener. I bind the callback to the options object that I pass to my Reader function. In the readFile function I make sure to return the call to Reader, which is a Promise.
/**
 * @desc callback to instruct what to do with each line read
 *
 * @param {*} line
 */
const readFileLine = function (line) {
  const linea = line.trim();
  console.log(linea);
  this.context += linea
}

/**
 * @desc once the added file is ready to be processed, read the file line by line
 * @listens {Event} listens for `process` event
 */
const readFile = (options) => {
  return Reader(options)
}

/**
 * @desc call the file reader and do what you need with the response
 *
 */
const getResponseFromFiles = (file) => {
  const opts = {}
  opts.cb = readFileLine.bind(opts)
  opts.context = ''
  opts.file = file

  readFile(opts)
    .then(data => {
      process.exitCode = 0
      console.log(data)
      return data
    })
    .catch(err => {
      process.exitCode = 1
      console.log(err.message)
    })
}

What is the "reactive" way to read file line-by-line

I'm learning reactive programming with RxJS and have encountered a case where I need to read a file line-by-line. I actually solved it with a solution like this one:
https://gist.github.com/yvele/447555b1c5060952a279
It works, but I have to use some plain JS code to transform the stream of Buffers into a stream of lines (the example above uses the "readline" module).
I wonder whether there are other ways to transform an Observable of Buffers into an Observable of lines, using RxJS operators, as in the example below.
var Rx = require('rx');
var fs = require('fs');

var lines = Rx.Observable
  .fromEvent(rl, 'data') // emits buffers over time
  // some transforms ...
  .subscribe(
    (line) => console.log(line), // emit string line by line
    err => console.log("Error: %s", err),
    () => console.log("Completed")
  );
You can probably achieve something pretty close to what you want with scan and concatMap.
Something like:
bufferSource
  .concat(Rx.Observable.of("\n")) // to make sure we don't miss the last line!
  .scan(({ buffer }, b) => {
    const splitted = buffer.concat(b).split("\n");
    const rest = splitted.pop();
    return { buffer: rest, items: splitted };
  }, { buffer: "", items: [] })
  // Each item here is a pair { buffer: string, items: string[] }
  // such that buffer contains the remaining input text that has no newline
  // and items contains the lines that have been produced by the last buffer
  .concatMap(({ items }) => items)
  // we flatten this into a sequence of items (strings)
  .subscribe(
    item => console.log(item),
    err => console.log(err),
    () => console.log("Done with this buffer source"),
  );
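For RxJS 6+ pipe syntax, roughly the same idea could be sketched like this, assuming buffer$ is an Observable emitting Buffer chunks (the name is illustrative):

const { scan, concatMap, map, endWith } = require('rxjs/operators');

const line$ = buffer$.pipe(
  map(chunk => chunk.toString('utf8')),
  endWith('\n'), // flush the last, possibly unterminated, line
  scan(({ buffer }, text) => {
    const pieces = (buffer + text).split('\n');
    const rest = pieces.pop();
    return { buffer: rest, items: pieces };
  }, { buffer: '', items: [] }),
  concatMap(({ items }) => items) // one string per line
);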
You can use the following class:
'use strict'

const lineReader = require('line-reader');
const Rx = require('rxjs');
const RxOp = require('rxjs/operators');

class CSVReader {
  constructor(filepath) {
    this.filepath = filepath;
  }

  readByLines() {
    const source = new Rx.Subject();

    lineReader.open(this.filepath, (err, reader) => {
      Rx.of(0).pipe(
        RxOp.expand(val => {
          reader.nextLine((err2, line) => source.next(line));
          return Rx.of(1 + val);
        }),
        RxOp.takeWhile(_ => {
          let has = reader.hasNextLine();
          if (!has) source.complete();
          return has;
        })
      ).subscribe(_ => _);
    })

    return source;
  }
}

module.exports = CSVReader
and use it as follows
const { bufferCount } = require('rxjs/operators');

let reader = new CSVReader('path/to/file');
reader.readByLines()
  .pipe(bufferCount(2)) // chunk size
  .subscribe(chunk => {
    console.log({ chunk });
  });
I would do it like this:
const readline = require('readline');
const fs = require('fs');
const path = require('path');
const { fromEvent, race, Observable } = require('rxjs');
const { tap, takeUntil, take, map } = require('rxjs/operators');

const rl = readline.createInterface({
  input: fs.createReadStream(path.resolve('./', 'myfile'))
});

let obs = new Observable(observer => {
  rl.on('line', val => observer.next(val)),
  rl.on('error', err => observer.error(err)),
  rl.on('close', complete => observer.complete(complete))
})
  .pipe(tap(line => console.log(`line: ${line}`)))

obs.subscribe(() => {},
  (e) => console.log(`Error reading file: ${e}`),
  () => console.log("Read complete"))
An alternative for creating the observable could be:
let obs = fromEvent(rl, 'line')
  .pipe(
    takeUntil(race(
      fromEvent(rl, 'close').pipe(take(1)),
      fromEvent(rl, 'error').pipe(map((err) => { throw err }))
    ))
  )
Ideally, RxJS could have provided an operator like fromEvent(emitter, nextEvent, errorEvent, completeEvent) to keep the above code even simpler.
I tried a bunch of the above answers and built my own ugly version. Then I poked around the RxJS code on GitHub and found that RxJS handles stream-like objects directly; there's no point in mucking around with events. Just pass a ReadStream to from: it checks it for ReadableStreamLike and then turns it into an AsyncGenerator.
import * as fs from 'node:fs';
import * as readline from 'node:readline';
import { from } from 'rxjs';

const file = fs.createReadStream(fileName);
const line = readline.createInterface({ input: file });

const line$ = from(line).subscribe({
  next: (dat) => { ... },
  error: (err) => { ... },
  complete: () => { ... }
});
