Asynchronous file read reads a different number of lines each time and does not halt - node.js

I built a simple asynchronous implementation of the readline module built into Node.js, which is simply a wrapper around the event-based module itself. The code is below:
const readline = require('readline');

module.exports = {
    createInterface: args => {
        let self = {
            interface: readline.createInterface(args),
            readLine: () => new Promise((succ, fail) => {
                if (self.interface === null) {
                    succ(null);
                } else {
                    self.interface.once('line', succ);
                }
            }),
            hasLine: () => self.interface !== null
        };
        self.interface.on('close', () => {
            self.interface = null;
        });
        return self;
    }
}
Ideally, I would use it in code like this:
const readline = require("./async-readline");
let filename = "bar.txt";
let linereader = readline.createInterface({
input: fs.createReadStream(filename)
});
let lines = 0;
while (linereader.hasLine()) {
let line = await linereader.readLine();
lines++;
console.log(lines);
}
console.log("Finished");
However, I've observed some erratic and unexpected behavior with this async wrapper. For one, it fails to recognize when the file ends and simply hangs once it reaches the last line, never printing "Finished". On top of that, when the input file is large, say a couple of thousand lines, it's always off by a few lines and doesn't read the full file before halting; in a 2000+ line file it can be off by as many as 20-40 lines. If I throw a print statement into the .on('close') listener, I see that it does trigger; however, the program still doesn't recognize that it should no longer have lines to read.

It seems that in Node.js v11.7 the readline interface was given async iterator functionality and can simply be looped through with a for await ... of loop:
const rl = readline.createInterface({
    input: fs.createReadStream(filename)
});

for await (const line of rl) {
    console.log(line);
}
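The most likely reason the wrapper drops lines is that readline keeps emitting 'line' events while the loop is between awaits: once('line') only captures the single event it is listening for, so any lines emitted before the next readLine() call are lost, and a readLine() promise created just before 'close' never settles, which explains the hang. Below is a minimal runnable sketch of the for await approach, assuming the loop is wrapped in an async function (top-level await isn't available in Node 11) and with crlfDelay added as an optional extra for Windows line endings.
const fs = require('fs');
const readline = require('readline');

async function countLines(filename) {
    const rl = readline.createInterface({
        input: fs.createReadStream(filename),
        crlfDelay: Infinity // optional: treat \r\n as a single line break
    });
    let lines = 0;
    for await (const line of rl) { // the iterator handles pausing, resuming and EOF
        lines++;
        console.log(lines);
    }
    console.log("Finished");
}

countLines("bar.txt");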
See also: How to get synchronous readline, or "simulate" it using async, in nodejs?

Related

Node.js stops when reading from multiple Readable streams

After creating a stream (A), then creating another stream (B) and reading from stream (B), reading from stream (A) stops.
How can I solve this problem?
Node.js v14.18.1
import * as readline from 'readline';
import { Readable } from 'stream';

async function main() {
    const streamA = Readable.from('a');
    const readerA = readline.createInterface({
        input: streamA,
        crlfDelay: Infinity
    });
    var stopCase = false;
    if (stopCase) {
        const streamB = Readable.from('b');
        const readerB = readline.createInterface({
            input: streamB,
            crlfDelay: Infinity
        });
        console.log('readB');
        for await (const line of readerB) {
            console.log(line);
        }
    }
    console.log(`readerA.closed = ${'closed' in readerA}`);
    console.log('readA');
    for await (const line of readerA) {
        console.log(line);
    }
    console.log('success');
}
main();
Output(stopCase=true):
readB
b
readerA.closed = true
readA
Output(stopCase=false):
readerA.closed = false
readA
a
success
The issue is that as soon as you do this:
const readerA = readline.createInterface({
    input: streamA,
    crlfDelay: Infinity
});
Then, streamA is now ready to flow and readerA is ready to generate events as soon as you hit the event loop. When you go into the stopCase block and hit the for await (const line of readerB), that will allow streamA to flow which will allow readerA to fire events.
But you aren't listening for readerA's events when they fire, so the streamA content is consumed while you aren't listening.
You can see how it works better if you don't create readerA until after you're done with the stopCase block, because then streamA and readerA aren't yet flowing when you hit the await inside the stopCase block.
This is what I would call a growing pain caused by trying to add promises onto event-driven streams. If you leave a stream in a flowing state intending to read its events with await, but you then await some other promise first, all the events on that first stream fire while you aren't yet listening. The stream doesn't know you're waiting to use await on it. You set it up to flow, so as soon as the interpreter hits the event loop, it starts flowing, even though you aren't listening with await.
I've run into this before in my own code and the solution is to not set a stream up to flow until you're either just about to use await to read it or until you have a more traditional event handler configured to listen to any events that flow. Basically, you can't configure two streams for use with for await (...) at the same time. Configure one stream, use it with your for await (...), then configure the other. And, be aware of any other promises used in your processing of the for await (...) loop too. There are lots of ways to goof up when using that structure.
In my opinion, it would work more reliably if a stream was actually put in a different state to be used with promises so it will ONLY flow via the promise interface. Then, this kind of thing would not happen. But, I'm sure there are many challenges with that implementation too.
For example, if you do this:
import * as readline from 'readline';
import { Readable } from 'stream';

async function main() {
    var stopCase = true;
    console.log(`stopCase = ${stopCase}`);
    if (stopCase) {
        const streamB = Readable.from('b');
        const readerB = readline.createInterface({
            input: streamB,
            crlfDelay: Infinity
        });
        console.log('readB');
        for await (const line of readerB) {
            console.log(line);
        }
    }
    const streamA = Readable.from('a');
    const readerA = readline.createInterface({
        input: streamA,
        crlfDelay: Infinity
    });
    console.log(`streamA flowing = ${streamA.readableFlowing}`);
    console.log(`readerA.closed = ${!!readerA.closed}`);
    console.log('readA');
    for await (const line of readerA) {
        console.log(line);
    }
    console.log('success');
}
main();
Then, you get all the output:
stopCase = true
readB
b
streamA flowing = true
readerA.closed = false
readA
a
success
The reason you never get the console.log('success') is probably that you hit the for await (const line of readerA) {...} loop and it gets stuck there on a promise that will never resolve because there is no more data. Meanwhile, nodejs notices that there is nothing left in the process that can create any future events, so it exits the process.
You can see that same concept in play in an even simpler app:
async function main() {
    await new Promise(resolve => {
        // do nothing
    });
    console.log('success');
}
main();
It awaits a promise that never completes, and there are no event-creating things left in the app, so nodejs just shuts down without ever logging success.

Break and continue Node.js readline async for loop

I'm using the Node.js readline interface to read a file line-by-line using an async for-of loop. But I want to be able to control the flow and I'm not sure how to break and continue the loop where it left off.
Simplified example:
const fileStream = fs.createReadStream('input.txt')
const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity
})

for await (const line of rl) {
    console.log(line) // This works
    break
}

for await (const line of rl) {
    console.log(line) // This does not print anything
}
See this replit for a complete example.
How do I use the same readline interface to continue the loop where it left off?
Exiting a for await ... of loop early closes the underlying stream (the loop calls the iterator's return()), so I went for this instead:
const it = this.lineReader[Symbol.asyncIterator]()
while (true) {
    const res = await it.next()
    if (res.done) break
    const line = res.value.trim()
    // ... use line ...
}
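A self-contained sketch of the same idea, reusing the question's setup (the file name here is just illustrative): drive the async iterator by hand so that stopping early never calls its return(), which is what closes the stream when you break out of a for await ... of loop.
const fs = require('fs')
const readline = require('readline')

async function main() {
    const rl = readline.createInterface({
        input: fs.createReadStream('input.txt'),
        crlfDelay: Infinity
    })
    const it = rl[Symbol.asyncIterator]()

    // read just the first line, then stop without closing the stream
    const first = await it.next()
    if (!first.done) console.log('first:', first.value)

    // later, continue from the second line with the same iterator
    for (let res = await it.next(); !res.done; res = await it.next()) {
        console.log('rest:', res.value)
    }
}

main()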

wait for writestream to finish before executing next function

I have two functions.
The first function reads all the files in a folder and writes their data to a new file.
The second function takes that new file (output of function 1) as input and creates another file. Therefore it has to wait until the write stream of function 1 has finished.
const fs = require('fs');
const path = require('path');

async function f1(inputDir, outputFile) {
    let stream = fs.createWriteStream(outputFile, {flags: 'a'}); // new data should be appended to outputFile piece by piece (hence flag a)
    let files = await fs.promises.readdir(inputDir);
    for (let file of files) {
        let pathOfCurrentFile = path.join(inputDir, file);
        let stat = fs.statSync(pathOfCurrentFile);
        if (stat.isFile()) {
            let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
            // now the data is being modified for output
            let result = data + 'other stuff';
            stream.write(result);
        }
    }
    stream.end();
}

function f2(inputFile, outputFile) {
    let newData = doStuffWithMy(inputFile);
    let stream = fs.createWriteStream(outputFile);
    stream.write(newData);
    stream.end();
}

f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
Here's what happens:
'myNewFile.txt' (output of f1) is created correctly
'myNewestFile.txt' is created but is either empty or only contains one or two words (it should contain a long text)
When I use a timeout before executing f2, it works fine, but I can't use a timeout because there can be thousands of input files in inputDir, so I need a way to do this dynamically.
I've experimented with async/await, callbacks, promises etc., but that stuff seems to be a little too advanced for me; I couldn't get it to work.
Is there anything else I can try?
Since you asked about a synchronous version, here's what that could look like. This should only be used in a single user script or in startup code, not in a running server. A server should only use asynchronous file I/O.
// synchronous version
function f1(inputDir, outputFile) {
    let outputHandle = fs.openSync(outputFile, "a");
    try {
        let files = fs.readdirSync(inputDir, {withFileTypes: true});
        for (let f of files) {
            if (f.isFile()) {
                let pathOfCurrentFile = path.join(inputDir, f.name);
                let data = fs.readFileSync(pathOfCurrentFile, 'utf8');
                fs.writeSync(outputHandle, data);
            }
        }
    } finally {
        fs.closeSync(outputHandle);
    }
}

function f2(inputFile, outputFile) {
    let newData = doStuffWithMy(inputFile);
    fs.writeFileSync(outputFile, newData);
}

f1('myFiles', 'myNewFile.txt');
f2('myNewFile.txt', 'myNewestFile.txt');
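For comparison, an asynchronous variant of the same flow is sketched below; it is not part of the original answer and assumes Node 15+ for the built-in stream/promises module, with doStuffWithMy() remaining the asker's placeholder. The point is that f1 only resolves after its write stream has flushed everything, so f2 can safely run afterwards.
// Asynchronous sketch (an assumption-laden alternative, not the answer above).
const fs = require('fs');
const path = require('path');
const { finished } = require('stream/promises'); // Node 15+

async function f1(inputDir, outputFile) {
    const stream = fs.createWriteStream(outputFile, { flags: 'a' });
    const files = await fs.promises.readdir(inputDir);
    for (const file of files) {
        const pathOfCurrentFile = path.join(inputDir, file);
        const stat = await fs.promises.stat(pathOfCurrentFile);
        if (stat.isFile()) {
            const data = await fs.promises.readFile(pathOfCurrentFile, 'utf8');
            stream.write(data + 'other stuff');
        }
    }
    stream.end();
    await finished(stream); // resolves once every queued write is flushed
}

async function main() {
    await f1('myFiles', 'myNewFile.txt');     // wait until the file is fully written
    f2('myNewFile.txt', 'myNewestFile.txt');  // f2 as defined in the question
}

main();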

Nodejs Read very large file(~10GB), Process line by line then write to other file

I have a 10 GB log file in a particular format, I want to process this file line by line and then write the output to other file after applying some transformations. I am using node for this operation.
This method works, but it takes a very long time. I was able to do this within 30-45 minutes in Java, but in Node it takes more than 160 minutes to do the same job. Following is the code:
Following is the initiation code which reads each line from the input.
var path = '../10GB_input_file.txt';
var output_file = '../output.txt';

function fileopsmain() {
    fs.exists(output_file, function(exists) {
        if (exists) {
            fs.unlink(output_file, function(err) {
                if (err) throw err;
                console.log('successfully deleted ' + output_file);
            });
        }
    });
    new lazy(fs.createReadStream(path, {bufferSize: 128 * 4096}))
        .lines
        .forEach(function(line) {
            var line_arr = line.toString().split(';');
            perform_line_ops(line_arr, line_arr[6], line_arr[7], line_arr[10]);
        });
}
This is the method that performs some operation on each line and passes the result to the write method, which writes it into the output file.
function perform_line_ops(line_arr, range_start, range_end, daynums) {
    var _new_lines = '';
    for (var i = 0; i < days; i++) {
        // perform some operation to modify line pass it to print
    }
    write_line_ops(_new_lines);
}
The following method is used to write data into a new file.
function write_line_ops(line) {
    if (line != null && line != '') {
        fs.appendFileSync(output_file, line);
    }
}
I want to bring this time down to 15-20 minutes. Is it possible to do so?
Also, for the record, I'm running this on an Intel i7 processor with 8 GB of RAM.
You can do this easily without a module. For example:
var fs = require('fs');
var inspect = require('util').inspect;

var buffer = '';
var rs = fs.createReadStream('foo.log');
rs.on('data', function(chunk) {
    var lines = (buffer + chunk).split(/\r?\n/g);
    buffer = lines.pop();
    for (var i = 0; i < lines.length; ++i) {
        // do something with `lines[i]`
        console.log('found line: ' + inspect(lines[i]));
    }
});
rs.on('end', function() {
    // optionally process `buffer` here if you want to treat leftover data
    // without a newline as a "line"
    console.log('ended on non-empty buffer: ' + inspect(buffer));
});
I can't guess where the possible bottleneck is in your code.
Can you add the library or the source code of the lazy function?
How many operations does your perform_line_ops do? (if/else, switch/case, function calls)
I've created an example based on your code. I know this doesn't directly answer your question, but it may help you understand how Node handles such a case.
const fs = require('fs')
const path = require('path')

const inputFile = path.resolve(__dirname, '../input_file.txt')
const outputFile = path.resolve(__dirname, '../output_file.txt')

function bootstrap() {
    // fs.exists is deprecated
    // check if output file exists
    // https://nodejs.org/api/fs.html#fs_fs_exists_path_callback
    fs.exists(outputFile, (exists) => {
        if (exists) {
            // output file exists, delete it
            // https://nodejs.org/api/fs.html#fs_fs_unlink_path_callback
            fs.unlink(outputFile, (err) => {
                if (err) {
                    throw err
                }
                console.info(`successfully deleted: ${outputFile}`)
                checkInputFile()
            })
        } else {
            // output file doesn't exist, move on
            checkInputFile()
        }
    })
}

function checkInputFile() {
    // check if input file can be read
    // https://nodejs.org/api/fs.html#fs_fs_access_path_mode_callback
    fs.access(inputFile, fs.constants.R_OK, (err) => {
        if (err) {
            // file can't be read, throw error
            throw err
        }
        // file can be read, move on
        loadInputFile()
    })
}

function saveToOutput() {
    // create write stream
    // https://nodejs.org/api/fs.html#fs_fs_createwritestream_path_options
    const stream = fs.createWriteStream(outputFile, {
        flags: 'w'
    })
    // return wrapper function which simply writes data into the stream
    return (data) => {
        // check if the stream is writable
        if (stream.writable) {
            if (data === null) {
                stream.end()
            } else if (data instanceof Array) {
                stream.write(data.join('\n'))
            } else {
                stream.write(data)
            }
        }
    }
}

function parseLine(line, respond) {
    respond([line])
}

function loadInputFile() {
    // create write stream
    const saveOutput = saveToOutput()
    // create read stream
    // https://nodejs.org/api/fs.html#fs_fs_createreadstream_path_options
    const stream = fs.createReadStream(inputFile, {
        autoClose: true,
        encoding: 'utf8',
        flags: 'r'
    })
    let buffer = null
    stream.on('data', (chunk) => {
        // append the buffer to the current chunk
        const lines = (buffer !== null)
            ? (buffer + chunk).split('\n')
            : chunk.split('\n')
        const lineLength = lines.length
        let lineIndex = -1
        // save last line for later (last line can be incomplete)
        buffer = lines[lineLength - 1]
        // loop through all lines
        // but don't include the last line
        while (++lineIndex < lineLength - 1) {
            parseLine(lines[lineIndex], saveOutput)
        }
    })
    stream.on('end', () => {
        if (buffer !== null && buffer.length > 0) {
            // parse the last line
            parseLine(buffer, saveOutput)
        }
        // Passing null signals the end of the stream (EOF)
        saveOutput(null)
    })
}

// kick off the parsing process
bootstrap()
I know this is old but...
At a guess, appendFileSync() writes to the file system and waits for the response. Lots of small writes are generally expensive; presuming you use a BufferedWriter in Java, you get faster results there because it skips some of those writes.
Use one of the async writes and see if Node buffers sensibly, or write the lines to a large Node Buffer until it is full and always write a full (or nearly full) Buffer. By tuning the buffer size you could verify whether the number of writes affects performance. I suspect it does.
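A rough sketch of that batching idea (not part of this answer; output_file and write_line_ops come from the question, and the 1 MB threshold is an arbitrary tunable): collect lines in memory and push them through a single write stream in large chunks instead of calling appendFileSync() once per line.
// Sketch only: batch many small lines into one large write.
var fs = require('fs');

var output_file = '../output.txt';
var out = fs.createWriteStream(output_file, {flags: 'a'});
var pending = [];
var pendingBytes = 0;
var FLUSH_AT = 1024 * 1024; // flush roughly every 1 MB (tunable)

function write_line_ops(line) {
    if (line != null && line != '') {
        pending.push(line);
        pendingBytes += Buffer.byteLength(line);
        if (pendingBytes >= FLUSH_AT) {
            out.write(pending.join('')); // one big write instead of thousands of tiny ones
            pending = [];
            pendingBytes = 0;
        }
    }
}

// call once after the read stream ends, to flush the tail and close the file
function flush_remaining() {
    if (pending.length > 0) {
        out.write(pending.join(''));
    }
    out.end();
}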
The execution is slow, because you're not using node's asynchronous operations. In essence, you're executing the code like this:
> read some lines
> transform
> write some lines
> repeat
While you could be doing everything at once, or at least reading and writing at the same time. Some examples in the answers here do that, but the syntax is somewhat complicated. Using scramjet you can do it in a couple of simple lines:
const {StringStream} = require('scramjet');

fs.createReadStream(path, {bufferSize: 128 * 4096})
    .pipe(new StringStream({maxParallel: 128})) // I assume this is a utf-8 file
    .split("\n")                                // split per line
    .parse((line) => line.split(';'))           // parse line
    .map(([line_arr, range_start, range_end, daynums]) => {
        return simplyReturnYourResultForTheOtherFileHere(
            line_arr, range_start, range_end, daynums
        ); // run your code, return promise if you're doing some async work
    })
    .stringify((result) => result.toString())
    .pipe(fs.createWriteStream(output_file))
    .on("finish", () => console.log("done"))
    .on("error", (e) => console.log("error", e))
This will probably run much faster.

Block for stdin in Node.js

Short explanation:
I'm attempting to write a simple game in Node.js that needs to wait for user input every turn. How do I avoid callback hell (i.e. messy code) inside a turn loop where each iteration needs to block and wait for input from stdin?
Long explanation:
All the explanations I have read on StackOverflow when someone asks about blocking for stdin input seem to be "that's not what Node.js is about!"
I understand that Node.js is designed to be non-blocking and I also understand why. However I feel that it has me stuck between a rock and a hard place on how to solve this. I feel like I have three options:
Find a way to block for stdin and retain my while loop
Ditch the while loop and instead recursively call a method (like nextTurn) whenever the previous turn ends.
Ditch the while loop and instead use setTimeout(0, ...) or something similar to call a method (like nextTurn) whenever a turn ends.
With option (1) I am going against Node.js principles of non-blocking IO.
With option (2) I will eventually reach a stack overflow as each call adds another turn to the call stack.
With option (3) my code ends up being a mess to follow.
Internal to Node.js there are built-in functions with Sync variants (e.g. see the fs library or the sleep function), and I'm wondering why there is no Sync method for getting user input. And if I were to write something similar to fs.readSync, how would I go about doing it while still following best practices?
Just found this:
https://www.npmjs.com/package/readline-sync
Example code (after doing an npm install readline-sync)
var readlineSync = require('readline-sync');

while (true) {
    var yn = readlineSync.question("Do you like having tools that let you code how you want, rather than how their authors wanted?");
    if (yn === 'y') {
        console.log("Hooray!");
    } else {
        console.log("Back to callback world, I guess...");
        process.exit();
    }
}
Only problem so far is the wailing of the "That's not how node is meant to be used!" chorus, but I have earplugs :)
I agree with the comment about moving towards an event based system and would ditch the loops. I've thrown together a quick example of text based processing which can be used for simple text games.
var fs = require('fs'),
    es = require('event-stream');

process.stdin
    .pipe(es.split())
    .on('data', parseCommand);

var actionHandlers = {};

function parseCommand(command) {
    var words = command.split(' '),
        action = '';
    if (words.length > 1) {
        action = words.shift();
    }
    if (actionHandlers[action]) {
        actionHandlers[action](words);
    } else {
        invalidAction(action);
    }
}

function invalidAction(action) {
    console.log('Unknown Action:', action);
}

actionHandlers['move'] = function(words) {
    console.log('You move', words);
}

actionHandlers['attack'] = function(words) {
    console.log('You attack', words);
}
You can now break up your actions into discrete functions which you can register with a central actionHandlers variable. This makes adding new commands almost trivial. If you can add some details on why the above approach wouldn't work well for you, let me know and I'll revise the answer.
ArtHare's solution, at least for my use case, blocks background execution, including tasks started by a promise. While this code isn't elegant, it did block execution of the current function until the read from stdin completed.
While this code must run from inside an async function, keep in mind that running an async function from a top-level context (directly from a script, not contained within any other function) will block that function until it completes.
Below is a full .js script demonstrating usage, tested with node v8.12.0:
const readline = require('readline');

const sleep = (waitTimeInMs) => new Promise(resolve => setTimeout(resolve, waitTimeInMs));

async function blockReadLine() {
    var rl = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
        terminal: false
    });

    let result = undefined;
    rl.on('line', function(line) {
        result = line;
    })

    while (!result) await sleep(100);
    return result;
}

async function run() {
    new Promise(async () => {
        while (true) {
            console.log("Won't be silenced! Won't be censored!");
            await sleep(1000);
        }
    });

    let result = await blockReadLine();
    console.log("The result was:" + result);
    process.exit(0);
}

run();
run();
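A lighter-weight variant of the same blocking-style flow (a sketch, not from the answers above): wrap readline's callback-based rl.question() in a promise and await it inside the turn loop. The ask() helper and prompt text here are hypothetical.
const readline = require('readline');

const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

// hypothetical helper: resolve with the next line the user enters
function ask(prompt) {
    return new Promise(resolve => rl.question(prompt, resolve));
}

async function gameLoop() {
    while (true) {
        const answer = await ask('Your move? ');
        if (answer === 'quit') break;
        console.log('You chose:', answer);
    }
    rl.close();
}

gameLoop();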
