What is the "reactive" way to read file line-by-line - node.js

I'm learning reactive programming with RxJS and ran into a case where I need to read a file line by line. I actually solved it using a solution like this one:
https://gist.github.com/yvele/447555b1c5060952a279
It works, but I need to use some normal JS code to transform the stream of Buffers to stream of lines. (use "readline" module in example above)
I wonder if there is another way to transform an Observable of Buffers into an Observable of lines using only RxJS operators, like in the example below.
var Rx = require('rx');
var fs = require('fs');
// Sketch from the question: `rl` stands for an emitter whose 'data' events
// carry Buffers; the transform steps are the part being asked about.
var lines = Rx.Observable
  .fromEvent(rl, 'data') // emits buffers over time
  // some transforms ...
  .subscribe(
    function onLine(line) {
      // expected to receive one string per line
      console.log(line);
    },
    function onError(err) {
      console.log("Error: %s", err);
    },
    function onCompleted() {
      console.log("Completed");
    }
  );

You can probably achieve something pretty close to what you want with scan and concatMap.
Something like:
bufferSource
  // Append a newline so the final line is flushed even when the source
  // does not end with one.
  .concat(Rx.Observable.of("\n"))
  // Accumulator: `buffer` carries the text after the last newline seen so far,
  // `items` holds the complete lines produced by the most recent chunk.
  .scan((state, chunk) => {
    const pieces = state.buffer.concat(chunk).split("\n");
    const remainder = pieces.pop();
    return { buffer: remainder, items: pieces };
  }, { buffer: "", items: [] })
  // Flatten every batch of lines into individual string emissions.
  .concatMap((state) => state.items)
  .subscribe(
    (item) => console.log(item),
    (err) => console.log(err),
    () => console.log("Done with this buffer source"),
  );

You can use the following class:
'use strict'
const lineReader = require('line-reader');
const Rx = require('rxjs');
const RxOp = require('rxjs/operators');
/**
 * Reads a text file line by line (via the `line-reader` package) and exposes
 * the lines as an RxJS stream.
 */
class CSVReader {
  /**
   * @param {string} filepath - Path of the file to read.
   */
  constructor(filepath) {
    this.filepath = filepath;
  }

  /**
   * Opens the file and pushes each line into a Subject, then completes it.
   * Open, read and close failures are forwarded through the Subject's error
   * channel instead of being ignored.
   *
   * @returns {Rx.Subject} Subject that emits one value per line.
   */
  readByLines() {
    const source = new Rx.Subject();
    lineReader.open(this.filepath, (err, reader) => {
      if (err) {
        source.error(err);
        return;
      }
      const readNext = () => {
        if (!reader.hasNextLine()) {
          // Release the file handle before signalling completion.
          reader.close((closeErr) => {
            if (closeErr) {
              source.error(closeErr);
            } else {
              source.complete();
            }
          });
          return;
        }
        reader.nextLine((lineErr, line) => {
          if (lineErr) {
            source.error(lineErr);
            return;
          }
          source.next(line);
          // Schedule the next read on a fresh stack so long files cannot
          // overflow it if nextLine invokes its callback synchronously.
          setImmediate(readNext);
        });
      };
      readNext();
    });
    return source;
  }
}
module.exports = CSVReader
and use it as follows
const { bufferCount } = require('rxjs/operators');
// Group the emitted lines two at a time (the chunk size) and log each group.
let reader = new CSVReader('path/to/file');
const chunk$ = reader.readByLines().pipe(bufferCount(2));
chunk$.subscribe((chunk) => {
  console.log({ chunk });
});

I would say like this:
const readline = require('readline');
const fs = require('fs');
const path = require('path');
const {fromEvent, race, Observable} = require('rxjs');
const {tap, takeUntil, take, map} = require('rxjs/operators');
// Keep a handle on the file stream: read failures (e.g. a missing file) are
// emitted on the stream, so they have to be observed there.
const input = fs.createReadStream(path.resolve('./', 'myfile'));
const rl = readline.createInterface({ input });

// Bridges the readline events to an Observable: 'line' -> next,
// 'error' -> error, 'close' -> complete.
let obs = new Observable((observer) => {
  const onLine = (line) => observer.next(line);
  const onError = (err) => observer.error(err);
  const onClose = () => observer.complete();

  rl.on('line', onLine);
  rl.on('error', onError);
  input.on('error', onError);
  rl.on('close', onClose);

  // Teardown: detach the listeners and stop reading once the subscriber
  // is gone (unsubscribe, error or completion).
  return () => {
    rl.off('line', onLine);
    rl.off('error', onError);
    input.off('error', onError);
    rl.off('close', onClose);
    rl.close();
  };
}).pipe(tap((line) => console.log(`line: ${line}`)));

obs.subscribe(
  () => {},
  (e) => console.log(`Error reading file: ${e}`),
  () => console.log("Read complete")
);
An alternative for creating the observable could be:
// Stop taking lines as soon as the interface closes; an 'error' event is
// rethrown so that it surfaces through the notifier.
const closed$ = fromEvent(rl, 'close').pipe(take(1));
const failed$ = fromEvent(rl, 'error').pipe(map((err) => { throw err; }));
let obs = fromEvent(rl, 'line').pipe(takeUntil(race(closed$, failed$)));
Ideally, rxjs could have provided an operator like: fromEvent(emitter, nextEvent, errorEvent, completeEvent ) to help keep the above code even simpler.

I tried a bunch of the above answers and built my own ugly version. Then I poked around the RxJS code on GitHub and found that it handles stream-like objects itself, so there's no point in mucking around with events. Just pass a ReadStream to from: it tests the argument for ReadableStreamLike and then turns it into an AsyncGenerator.
import * as fs from 'node:fs';
import * as readline from 'node:readline';
import { from } from 'rxjs';

// `fileName` is the path of the file to read; it is supplied by the caller.
const file = fs.createReadStream(fileName);
const line = readline.createInterface({ input: file });

// Observable of lines built directly from the readline interface.
const line$ = from(line);

// subscribe() returns a Subscription; keep it so reading can be cancelled.
const subscription = line$.subscribe({
  next: (dat) => { /* handle one line of text */ },
  error: (err) => { /* handle a read failure */ },
  complete: () => { /* every line has been delivered */ },
});

Related

Stream (Geo)JSON file and get startByte and endByte of each JSON record in the file

For very large JSON/GeoJSON files, I'd like to create a primitive key/value store that keeps track of the starting positions and lengths of each JSON record in the file. This way, I could look up individual records at a later stage without reading the whole file into memory (Using the fd.read API). Somewhat similar to a super simple database, but read-only and without the extra overhead.
The issue I'm facing is that I don't know how I could determine the starting position and byte length of each JSON record / GeoJSON feature in the original file.
Here's some pseudo-code showcasing what I'm trying to achieve, note that the geojsonStream.parse callback doesn't receive the startByte and length arguments in reality though.
Thanks for your help, also happy about any feedback outlining why this might be a bad idea :)
import geojsonStream from 'geojson-stream'
import { open } from 'fs/promises'
import { Buffer } from 'buffer'
/**
 * Pseudo-code: collects the byte position and length of every feature while
 * streaming the file through geojson-stream.
 *
 * @param {object} fd - Open file handle exposing createReadStream().
 * @returns {Promise<Array<{index: number, startPosition: number, length: number}>>}
 */
function getFeaturePositionsInFile(fd) {
  return new Promise((resolve, reject) => {
    const featurePositionsInFile = []
    const stream = fd
      .createReadStream()
      .pipe(geojsonStream.parse((building, index, startByte, length) => {
        // The startByte and length callback arguments are not real unfortunately :(
        featurePositionsInFile.push({
          index,
          startPosition: startByte,
          length
        })
      }))
    stream.on('end', () => resolve(featurePositionsInFile))
    // Pass reject itself as the handler; `() => reject` would only return
    // the function and leave the promise pending forever.
    stream.on('error', reject)
  })
}
/**
 * Reads `length` bytes starting at `startPosition` and parses them as JSON.
 *
 * @param {object} fd - Open file handle exposing read(buffer, offset, length, position).
 * @param {number} startPosition - Byte offset of the record in the file.
 * @param {number} length - Byte length of the record.
 * @returns {Promise<object>} The parsed record; rejects on read or parse failure.
 */
async function readSingleFeatureFromFile(fd, startPosition, length) {
  const buff = Buffer.alloc(length)
  const offset = 0
  const { buffer, bytesRead } = await fd.read(buff, offset, length, startPosition)
  // Decode only the bytes that were actually read.
  return JSON.parse(buffer.toString('utf8', 0, bytesRead))
}
// Open the file once and build the position index for all features.
const fd = await open('buildings.geojson')
const featurePositionsInFile = await getFeaturePositionsInFile(fd)
// Look up a single record through the index and read only its bytes.
const featureIndexToRead = 0
const { startPosition, length } = featurePositionsInFile[featureIndexToRead]
const singleFeature = await readSingleFeatureFromFile(fd, startPosition, length)
Alright, since I couldn't find a suitable package for my needs, I created a simple (naïve) solution using RegExp to extract single GeoJSON features.
It works given:
The GeoJSON has a properties object, and the object is the last key in the parent GeoJSON object
the GeoJSON (properties) consists solely of ASCII characters
For GeoJSON files containing non-ASCII characters, the byte counting is off. I tried but couldn't really find out what exactly I'm doing wrong, so any help is appreciated!
For a more general solution, I guess one would need to implement the byte counting logic in an existing library such as stream-json
import { open } from 'fs/promises'
import { Buffer } from 'buffer'
const HIGHWATERMARK = 64 * 1024 / 8

/**
 * Scans a GeoJSON file and records, for every Feature, the byte offset at
 * which it starts and the number of bytes it spans.
 *
 * @param {object} fd - Open file handle exposing createReadStream().
 * @returns {Promise<Array<{startPosition: number, byteLength: number}>>}
 *   One entry per feature, in file order; rejects if the stream errors.
 */
function getFeaturePositionsInFile(fd) {
  return new Promise((resolve, reject) => {
    const featurePositionsInFile = []
    const stream = fd.createReadStream({ highWaterMark: HIGHWATERMARK, autoClose: false })
    // this RegEx will solely work with standard GeoJSON without any foreign members:
    // https://datatracker.ietf.org/doc/html/rfc7946#section-6.1
    // The properties object has to be present, and has to be that last key in the GeoJSON object
    const jsonExtractor = /\{[\n\r\s]*?"type":[\n\r\s]*?"Feature"[\S\s]*?\}(?:[\n\r\s]*\})+/g
    // Input not yet consumed. Chunks are decoded as latin1, which maps every
    // byte to exactly one character, so string indices ARE byte offsets and
    // multi-byte UTF-8 sequences split across chunks cannot be corrupted.
    // The regex only anchors on ASCII punctuation, so matching is unaffected.
    let pending = ''
    // Byte offset in the file of pending[0].
    let pendingStart = 0
    stream.on('data', (chunk) => {
      pending += chunk.toString('latin1')
      jsonExtractor.lastIndex = 0
      let consumed = 0
      let match
      while ((match = jsonExtractor.exec(pending)) !== null) {
        featurePositionsInFile.push({
          startPosition: pendingStart + match.index,
          byteLength: match[0].length
        })
        consumed = match.index + match[0].length
      }
      // drop everything up to the last complete feature to free memory
      pending = pending.substring(consumed)
      pendingStart += consumed
    })
    stream.on('end', () => resolve(featurePositionsInFile))
    // Pass reject itself; `() => reject` would never settle the promise.
    stream.on('error', reject)
  })
}
/**
 * Reads `length` bytes starting at `startPosition` and parses them as JSON.
 *
 * @param {object} fd - Open file handle exposing read(buffer, offset, length, position).
 * @param {number} startPosition - Byte offset of the feature in the file.
 * @param {number} length - Byte length of the feature.
 * @returns {Promise<object>} The parsed feature; rejects on read or parse failure.
 */
async function readSingleFeatureFromFile(fd, startPosition, length) {
  const buff = Buffer.alloc(length)
  const offset = 0
  const { buffer, bytesRead } = await fd.read(buff, offset, length, startPosition)
  // Decode only the bytes that were actually read.
  const featureString = buffer.toString('utf8', 0, bytesRead)
  return JSON.parse(featureString)
}
/**
 * Looks up the stored position of the requested feature and reads it from
 * the module-level file handle `fd`.
 *
 * @param {number} featureIndexToRead - Index into featurePositionsInFile.
 * @param {Array<{startPosition: number, byteLength: number}>} featurePositionsInFile
 * @returns {Promise<object>} The parsed feature.
 */
async function getFeature(featureIndexToRead, featurePositionsInFile) {
  const entry = featurePositionsInFile[featureIndexToRead]
  return await readSingleFeatureFromFile(fd, entry.startPosition, entry.byteLength)
}
// source: https://raw.githubusercontent.com/node-geojson/geojson-stream/master/test/data/featurecollection.geojson
const path = 'featurecollection.geojson'
// -> has 3 features
const fd = await open(path, 'r')
try {
  const featurePositionsInFile = await getFeaturePositionsInFile(fd)
  // get nth (e.g 3rd) feature in file; indices are zero-based
  const thirdFeature = await getFeature(2, featurePositionsInFile)
  console.log(thirdFeature)
} finally {
  // done! always close the filehandle, even when indexing or reading failed
  await fd.close()
}
https://gist.github.com/chrispahm/c226cca151b25147869288600151a5f8

NodeJS: How to read and modify buffer data, before parsing it from file?

In NodeJS, I am having a log file, where my logs are in the format:
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}},
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}},
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}},
These are basically objects, separated with comma. What I need to do is to read the content of this file and to convert the logs into an array of objects (that I can manipulate later).
I am trying with something like:
// Attempt from the question: read the whole log file and parse it in one go.
let fileLogs = "./data/myfile.log";
// The question text notes that fileLogsContent arrives as a buffer.
fs.readFile(fileLogs, (err, fileLogsContent) => {
if (err) {
console.log("cannot read log file");
return;
}
//I know I need to manipulate the fileLogsContent here, before doing JSON.parse
// The file holds comma-separated objects rather than one JSON document, which
// is why (per the question) this JSON.parse call fails.
let logsContent = { ...JSON.parse(fileLogsContent) };
//do something here with the array of objects 'logsContent'
});
Since the content in the log file is not in a format that can be parsed, the above JSON.parse fails. My idea is to bring the logfile in the following format:
[
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}},
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}},
{"time":"2021-09-23T11:36:18.076Z","type":"info","message":"some message","data":{"id":123}}
]
Which means that on the fly I would need to prepend [ as a first character and to replace the last , with ]. I am not aware how can I do that, since the fileLogsContent is actually a buffer. So how can I read the content and do the manipulations I mentioned, in order to be able to parse it later and to bring it into array of objects format?
You can simply wrap each line in a string and then call JSON.parse on it after removing the trailing comma. Here's an example (note that it still needs error-handling ofc):
const fs = require('fs');
const readline = require('readline');
// Keep a handle on the file stream: read failures are emitted there.
const input = fs.createReadStream('./input.txt');
const readInterface = readline.createInterface({
  input,
  crlfDelay: Infinity
});

(async () => {
  // Resolves with one parsed object per non-empty line; rejects on the first
  // read or parse failure.
  const resultArray = await new Promise((resolve, reject) => {
    const chunks = [];
    input.on('error', reject);
    readInterface.on('line', (line) => {
      // Strip only a trailing comma, so a line without one stays intact.
      const json = line.trim().replace(/,$/, '');
      if (json === '') {
        return;
      }
      try {
        chunks.push(JSON.parse(json));
      } catch (err) {
        // Reject first: close() emits 'close', which would otherwise resolve.
        reject(err);
        readInterface.close();
      }
    });
    readInterface.on('close', () => {
      resolve(chunks);
    });
  });
  console.log(resultArray);
})().catch((err) => {
  console.error('cannot parse log file:', err);
});
Here is the working solution that I came up with, following the directions from @eol's answer.
const { once } = require('events');
const { createReadStream } = require('fs');
const { createInterface } = require('readline');
// Reads ./data/myfile.log line by line, parses every line as JSON (ignoring a
// trailing comma) and logs the resulting array.
(async function processLineByLine() {
  try {
    const input = createReadStream('./data/myfile.log');
    const rl = createInterface({
      input,
      crlfDelay: Infinity
    });
    const chunks = [];
    // An exception thrown inside an event handler would bypass this
    // try/catch, so the first failure is recorded and rethrown after 'close'.
    let failure = null;
    input.once('error', (err) => {
      failure = err;
      rl.close();
    });
    rl.on('line', (line) => {
      if (failure !== null) {
        return;
      }
      // Strip only a trailing comma, so a line without one stays intact.
      const json = line.trim().replace(/,$/, '');
      if (json === '') {
        return;
      }
      try {
        chunks.push(JSON.parse(json));
      } catch (err) {
        failure = err;
        rl.close();
      }
    });
    await once(rl, 'close');
    if (failure !== null) {
      throw failure;
    }
    console.log('File processed. Content = ', chunks);
  } catch (err) {
    console.log("cannot read log file, err = ", err);
  }
})();

How to use readline in NodeJs on an event (wait stream close)

I'm creating a bot that when it receives a message begins to read a text file and responds to the message with the contents of the file.
Unfortunately, I can't get out of this asynchronous hell and I only get errors, undefined or promise
The last experiment was this:
const fs = require('fs');
const readline = require('readline');
// bot.listen("message").reply(responseText())
// Code from the question, kept as posted to show what goes wrong.
// readFile has no return statement of its own, so it yields undefined.
function readFile(file) {
var text = '';
var readInterface = readline.createInterface({
input: fs.createReadStream(file),
terminal: false
});
readInterface.on('line', function(line) {
linea = line.trim();
console.log(linea);
text += linea;
}).on('close', function() {
// This returns from the 'close' handler only, not from readFile.
return text;
});
// NOTE(review): the next line closes a bracket that was never opened, so the
// snippet does not parse as posted.
});
}
// Awaits the (undefined) result of readFile and then calls .then on it.
async function responseText() {
var content = await readFile("file.txt");
content.then(function(data) {
return data;
})
}
What I would like to get then is delay the response until I get the contents of the file.
I know that node is based on async but I can't figure out how to handle it!
Thanks all
If you want to use async/await, you need to create a promise and return it.
function readFile(file) {
return new Promise((res, rej) => {
try {
var text = '';
var readInterface = readline.createInterface({
input: fs.createReadStream(file),
terminal: false
});
readInterface
.on('line', function (line) {
linea = line.trim();
text += linea;
})
.on('close', function () {
res(text);
});
} catch(err){
rej(err)
}
});
}
If you're using express.js or any framework built on top of it, you can simply pipe the read stream to the response, since express's responses are streams to begin with:
const es = require('event-stream')
...
/**
 * Streams a file of newline-separated JSON records and maps every record to
 * its inspected string form.
 *
 * @param {string} path - Path of the file to stream.
 * @returns {Stream} The mapped stream; parse failures are emitted as 'error'.
 */
let getFileStream = path => (
  fs.createReadStream(path)
    .pipe(es.split())
    .pipe(es.map(function (data, cb) {
      // Splitting leaves an empty segment after a trailing newline; calling
      // cb() without arguments drops the item.
      if (data === '') {
        cb();
        return;
      }
      try {
        cb(null, inspect(JSON.parse(data)));
      } catch (err) {
        // Report the failure through the stream instead of throwing.
        cb(err);
      }
    }))
);
// Pipe the transformed file straight into the HTTP response; stream errors
// are handed to the framework's error handler.
router.get('/message', (req, res, next) => {
  const file$ = getFileStream(yourFilePath);
  file$.on('error', next);
  file$.pipe(res);
});
If you need to transform the file content, you can use a transform stream or as shown in the example above, a synchronous event-stream mapping. The idea is to always play around with the file content at stream level to avoid having to load the entire file content in memory.
You don't really want to buffer the whole file content in memory. It can quickly become a problem with huge files on a busy day. what you need is to pipe the file stream directly to the browser. Same principle applies for any kind of consumer.
Of course, if the mechanism is all internal, you should only pass the file path along or the actual stream until you need to actually open the file and do something with the content. In this case, you go back to your stream toolbox, whether it be the native node.js stream API implementation, the event-stream package or some kind of observable library like rxjs.
I had a similar issue in an app that watches a directory for new files, reads the file(s) and returns derived data based on the file content. My Reader function is based on this async example from the nodejs docs. I return options, which contains the context, only after the file is read completely.
const { createReadStream } = require('fs')
const { createInterface } = require('readline')
const { once } = require('events')
// Reader.js
async function Reader (options) {
let { file, cb } = options
let fileStream = createReadStream(file)
const readInterface = createInterface({
input: fileStream,
crlfDelay: Infinity
})
readInterface.on('line', (line) => {
cb(line)
})
await once(readInterface, 'close')
return options
}
module.exports = Reader
I then have a file which imports my Reader and defines how to use it. I define a callback function to pass to the line event listener. I bind the callback to the options object that I pass to my Reader function. In the
readFile function I make sure to return the call to Reader, which is a Promise.
/**
 * @desc callback to instruct what to do with each line read: logs the trimmed
 * line and appends it to `this.context`, so it must be bound to (or called
 * on) the options object that carries `context`.
 *
 * @param {string} line
 */
const readFileLine = function (line) {
  const linea = line.trim()
  console.log(linea)
  this.context += linea
}
/**
 * @desc once the added file is ready to be processed read file line by line
 * @listens {Event} listens for `process` event
 * @param {object} options - forwarded unchanged to Reader
 * @returns whatever Reader returns for these options
 */
const readFile = function (options) {
  const pendingRead = Reader(options)
  return pendingRead
}
/**
 * @desc Call the file reader and do what you need with the response
 *
 * @param {string} file - path of the file to read
 * @returns {Promise<object|undefined>} resolves with the options object
 *   (its `context` holds the accumulated text) once the file has been read,
 *   or with undefined after a failure has been logged
 */
const getResponseFromFiles = (file) => {
  const opts = { context: '', file }
  // Bind the line callback so that `this.context` refers to opts.
  opts.cb = readFileLine.bind(opts)
  // Return the chain so callers can wait for the file to be read.
  return readFile(opts)
    .then(data => {
      process.exitCode = 0
      console.log(data)
      return data
    })
    .catch(err => {
      process.exitCode = 1
      console.log(err.message)
    })
}

Nodejs - read line by line from file, perform async action for each line and resume

I'm trying to read a file line by line, perform some action that has a callback and when the function finishes to resume line reading. For example:
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
// Reproduction from the question: pause the interface on every line and
// resume it two seconds later.
var instream = fs.createReadStream('./phrases.txt');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
rl.on('line', function (line) {
// Intended to hold back further 'line' events until the timer fires.
// NOTE(review): the question reports that many lines still arrive after the
// first pause; presumably lines that were already buffered keep being
// emitted. Confirm against the readline documentation.
rl.pause();
setTimeout(function () {
console.log('resuming');
rl.resume();
}, 2000);
});
I was under the impression the example above should basically read a line, wait for 2 seconds, console.log and then continue to the next line. What really happens is that it waits for the first 2 seconds and then spews out lots of console.log
The line-by-line module helps you read large text files line by line, without buffering the files into memory.
You can process the lines asynchronously. This is the example provided:
var LineByLineReader = require('line-by-line');
var lr = new LineByLineReader('big_file.txt');

function handleError(err) {
  // 'err' contains error object
}

function handleLine(line) {
  // Stop the reader from emitting while this line is being processed...
  lr.pause();
  // ...do the asynchronous work (simulated with a timer here)...
  setTimeout(function resumeReading() {
    // ...then let the reader continue with the next line.
    lr.resume();
  }, 100);
}

function handleEnd() {
  // All lines are read, file is closed now.
}

lr.on('error', handleError);
lr.on('line', handleLine);
lr.on('end', handleEnd);
Solution without installing any external library. You only need the native node.js "readline" module. Just do the following:
import fs from "fs";
import readline from "readline";

const readInterface = readline.createInterface({
  // __dirname does not exist in ES modules; resolve the file relative to
  // this module's own URL instead.
  input: fs.createReadStream(new URL('myfile.txt', import.meta.url))
});

// The loop body is awaited before the next line is handed out, so lines are
// processed strictly one at a time.
for await (const line of readInterface) {
  await someAsynchronousOperation(line);
}
Source (Official documentation): https://nodejs.org/api/readline.html#rlsymbolasynciterator
A very nice line-reader module exists,
https://github.com/nickewing/line-reader
simple code:
var lineReader = require('line-reader');

// Called once per line; `last` tells whether this is the final line.
function handleLine(line, last) {
  // do whatever you want with line...
  console.log(line);
  if (!last) {
    return;
  }
  // this was the last line of the file
}

lineReader.eachLine('file.txt', handleLine);
also "java-style" interface for more control:
// Opens the file, prints its first line (if any) and always closes the
// reader again. Callbacks follow the error-first convention.
lineReader.open('file.txt', function (err, reader) {
  if (err) throw err;
  if (reader.hasNextLine()) {
    reader.nextLine(function (err, line) {
      try {
        if (err) throw err;
        console.log(line);
      } finally {
        // Release the file handle whether or not reading succeeded.
        reader.close(function (err) {
          if (err) throw err;
        });
      }
    });
  } else {
    reader.close(function (err) {
      if (err) throw err;
    });
  }
});
Another cool solution:
var fs = require('fs');
var sleep = require('sleep');
var readline = require('readline');

var phrases = fs.createReadStream('phrases.txt');
var rd = readline.createInterface({
  input: phrases,
  output: process.stdout,
  terminal: false
});

// Print a separator and the line, then call sleep.sleep(2) before returning
// from the handler.
function printThenWait(line) {
  console.log('-------')
  console.log(line);
  sleep.sleep(2)
}

rd.on('line', printThenWait);
/**
 * Streams a file and emits it line by line.
 *
 * Events on the returned emitter:
 *  - "line"  (Buffer): one line without its trailing "\n"
 *  - "end"   (): emitted once, after the last line
 *  - "error" (Error): a read failure of the underlying stream
 *
 * @param {string} fileName - Path of the file to read.
 * @returns {EventEmitter}
 */
function createLineReader(fileName) {
  const { EventEmitter } = require("events")
  const ev = new EventEmitter()
  const stream = require("fs").createReadStream(fileName)
  // Bytes of a line that was cut off at the end of the previous chunk.
  let remainder = null
  stream.on("data", function (data) {
    // Prepend the unfinished line from the previous chunk.
    const chunk = remainder !== null ? Buffer.concat([remainder, data]) : data
    let start = 0
    let newline
    while ((newline = chunk.indexOf(10, start)) !== -1) { // 10 = "\n"
      ev.emit("line", chunk.subarray(start, newline))
      start = newline + 1
    }
    remainder = start < chunk.length ? chunk.subarray(start) : null
  })
  stream.on("end", function () {
    // Flush a final line that has no terminating newline.
    if (remainder !== null) ev.emit("line", remainder)
    ev.emit("end")
  })
  stream.on("error", function (err) {
    ev.emit("error", err)
  })
  return ev
}
//---------main---------------
// Print every line of the file named by the first command-line argument.
const fileName = process.argv[2]
const lineReader = createLineReader(fileName)
lineReader.on("line", function (line) {
  // Lines arrive as Buffers; convert them for display.
  console.log(line.toString())
  //console.log("++++++++++++++++++++")
})
Here is a simple solution in typescript using line-reader that can run in nodejs 8:
import lineReader from 'line-reader';
/**
 * Feeds a file to `processLine` one line at a time, asking for the next line
 * only after the promise for the current one has fulfilled.
 *
 * @param filename - Path of the file to read.
 * @param processLine - Async handler invoked for every line.
 * @returns Promise that resolves after the last line was processed and
 *   rejects when `processLine` rejects.
 */
function readLines(filename: string, processLine: (line: string) => Promise<void>): Promise<void> {
  return new Promise((done, fail) => {
    lineReader.eachLine(filename, (text, isLast, next) => {
      if (!next) {
        throw new Error('panic');
      }
      processLine(text)
        .then(() => (isLast ? done() : next()))
        .catch(fail);
    });
  });
}
/** Copies standard input to the console, one line at a time. */
async function echo(): Promise<void> {
  const printLine = async (line: string): Promise<void> => {
    console.log(line);
  };
  await readLines('/dev/stdin', printLine);
}
echo();
Note that it does not buffer the whole file before executing, therefore it is suitable for processing large text files.
I suggest to use stdio for this kind of things, as input stream is paused and resumed automatically and you don't need to worry about your system resources. You'll be able to read really huge files with just a few MBs of memory:
This example prints a line every 2 seconds:
$ node myprogram.js < file.txt
import { read } from 'stdio';
// Handler for a single input line: print it, then wait two seconds.
// `sleep` is not a Node.js built-in; the surrounding text gives a one-line
// implementation.
async function onLine (line) {
console.log(line);
await sleep(2000);
}
// Pass the handler to read() and log once the returned promise fulfils.
read(onLine)
.then(() => console.log('finished'));
Note I'm using an asynchronous sleep to represent any asynchronous task. It is not included in Node.js by default but it would be as follows:
/**
 * Promise-based delay.
 * @param {number} delay - Milliseconds to wait.
 * @returns {Promise<void>} Fulfils once the timer fires.
 */
const sleep = function (delay) {
  return new Promise(function (resolve) {
    setTimeout(resolve, delay);
  });
};
const readline = require('readline');
const fs = require('fs');
// Wrap the file stream in a readline interface and log each line.
const sampleStream = fs.createReadStream('sample.txt');
const rl = readline.createInterface({ input: sampleStream });

function logLine(line) {
  console.log(`Line from file: ${line}`);
}

rl.on('line', logLine);
source: https://nodejs.org/api/readline.html#readline_example_read_file_stream_line_by_line

Parsing huge logfiles in Node.js - read in line-by-line

I need to do some parsing of large (5-10 Gb)logfiles in Javascript/Node.js (I'm using Cube).
The logline looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely, there's a simple way within the stdlib to read in a textfile, line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing using string splits, and the JS equivalent of contains (IndexOf != -1?) will be a lot faster than regexes? Has anybody had much experience in parsing massive amounts of text data in Node.js?
I searched for a solution to parse very large files (gbs) line by line using a stream. All the third-party libraries and examples did not suit my needs since they processed the files not line by line (like 1 , 2 , 3 , 4 ..) or read the entire file to memory
The following solution can parse very large files, line by line using stream & pipe. For testing I used a 2.1 gb file with 17.000.000 records. Ram usage did not exceed 60 mb.
First, install the event-stream package:
npm install event-stream
Then:
var fs = require('fs')
, es = require('event-stream');
// Counts the lines handled so far.
var lineNr = 0;
// Pipeline: file stream -> split into lines -> synchronous per-line handler.
// `s` is the stream returned by the last pipe() call, i.e. the mapSync stream.
var s = fs.createReadStream('very-large-file.csv')
.pipe(es.split())
.pipe(es.mapSync(function(line){
// pause the readstream
s.pause();
lineNr += 1;
// process line here and call s.resume() when rdy
// function below was for logging memory usage
// (logMemoryUsage is not defined in this snippet)
logMemoryUsage(lineNr);
// resume the readstream, possibly from a callback
// As written, resume() runs in the same call as pause(); move it into the
// completion callback of any asynchronous work.
s.resume();
})
.on('error', function(err){
console.log('Error while reading file.', err);
})
.on('end', function(){
console.log('Read entire file.')
})
);
Please let me know how it goes!
You can use the inbuilt readline package, see docs here. I use stream to create a new output stream.
var fs = require('fs'),
readline = require('readline'),
stream = require('stream');
// Input is the file to read; output is a bare stream object that is flagged
// as readable and writable by hand.
var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;
var rl = readline.createInterface({
input: instream,
output: outstream,
terminal: false
});
// Log each line as it is read.
rl.on('line', function(line) {
console.log(line);
//Do your stuff ...
//Then write to output stream
// NOTE(review): this calls write() on the readline interface itself, not on
// outstream; confirm against the readline docs that this has the intended effect.
rl.write(line);
});
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted answer here. I made some improvements:
Code is in a class (modular)
Parsing is included
The ability to resume is handed to the caller, in case an asynchronous job (such as inserting into a DB or making an HTTP request) is chained to reading the CSV
Reading in chunks/batches of a size that the user can declare. I took care of encoding in the stream too, in case you have files in a different encoding.
Here's the code:
'use strict'
const fs = require('fs'),
util = require('util'),
stream = require('stream'),
es = require('event-stream'),
parse = require("csv-parse"),
iconv = require('iconv-lite');
// Reads a delimited text file line by line and hands the parsed rows to a
// callback in batches of `batchSize` lines.
class CSVReader {
// filename: path of the file to read.
// batchSize: number of lines per batch; falls back to 1000 when falsy.
// NOTE(review): the `columns` parameter is accepted but never used.
constructor(filename, batchSize, columns) {
// File stream decoded as UTF-8 through iconv-lite.
this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
this.batchSize = batchSize || 1000
this.lineNumber = 0
// Rows collected for the current batch.
this.data = []
// Options passed to csv-parse for every single line.
this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
}
// Starts reading; `callback` receives this.data every time the line count
// reaches a multiple of batchSize.
// NOTE(review): lines left over after the last full batch are never passed
// to `callback`.
read(callback) {
this.reader
.pipe(es.split())
.pipe(es.mapSync(line => {
++this.lineNumber
// NOTE(review): `err` is ignored and d[0] is read unconditionally.
// NOTE(review): presumably this callback may run after the batch check
// below; confirm whether parse() is asynchronous here.
parse(line, this.parseOptions, (err, d) => {
this.data.push(d[0])
})
if (this.lineNumber % this.batchSize === 0) {
callback(this.data)
}
})
.on('error', function(){
console.log('Error while reading file.')
})
.on('end', function(){
console.log('Read entirefile.')
}))
}
// Clears the current batch and resumes the underlying reader.
// NOTE(review): nothing in this class pauses the reader, so confirm that
// resume() has any effect.
continue () {
this.data = []
this.reader.resume()
}
}
module.exports = CSVReader
So basically, here is how you will use it:
// A class constructor must be invoked with `new`; calling it as a plain
// function throws a TypeError.
let reader = new CSVReader('path_to_file.csv')
// Ask for the next batch as soon as the current one has been handled.
reader.read(() => reader.continue())
I tested this with a 35GB CSV file and it worked for me, which is why I chose to build it on @gerard's answer. Feedback is welcome.
I used https://www.npmjs.com/package/line-by-line for reading more than 1 000 000 lines from a text file. In this case, an occupied capacity of RAM was about 50-60 megabyte.
const LineByLineReader = require('line-by-line');
const lr = new LineByLineReader('big_file.txt');

const onError = (err) => {
  // 'err' contains error object
};

const onLine = (line) => {
  // Stop the reader from emitting while this line is being processed...
  lr.pause();
  // ...do the asynchronous work (simulated with a timer here)...
  setTimeout(() => {
    // ...then let the reader continue with the next line.
    lr.resume();
  }, 100);
};

const onEnd = () => {
  // All lines are read, file is closed now.
};

lr.on('error', onError);
lr.on('line', onLine);
lr.on('end', onEnd);
The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');
// `await` is not allowed at the top level of a CommonJS module (this snippet
// uses require), so the work is wrapped in an async function.
(async function processLineByLine() {
  try {
    const rl = readline.createInterface({
      input: fs.createReadStream('sample.txt'),
      crlfDelay: Infinity
    });
    rl.on('line', (line) => {
      console.log(`Line from file: ${line}`);
    });
    // Wait until the interface has been closed.
    await once(rl, 'close');
  } catch (err) {
    console.error(err);
  }
})();
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.
Apart from reading the big file line by line, you can also read it chunk by chunk. For more, refer to this article:
const fs = require('fs');

// Read the file in fixed-size chunks and collect its lines.
const chunkSize = 2048;
const chunkBuffer = Buffer.alloc(chunkSize);
const fp = fs.openSync('filepath', 'r');
const lines = [];
// Bytes of a line that was cut off at the end of the previous chunk. Carrying
// bytes (instead of rewinding the offset by a character count) stays correct
// for multi-byte characters and for lines longer than one chunk.
let carry = Buffer.alloc(0);
let offset = 0;
try {
  let bytesRead;
  while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset)) > 0) {
    offset += bytesRead;
    const data = Buffer.concat([carry, chunkBuffer.subarray(0, bytesRead)]);
    let start = 0;
    let newline;
    while ((newline = data.indexOf(10, start)) !== -1) { // 10 = '\n'
      lines.push(data.toString('utf8', start, newline));
      start = newline + 1;
    }
    // the tail may not be a full line, leave it to the next chunk
    carry = data.subarray(start);
  }
  // A final line without a trailing newline is still a line.
  if (carry.length > 0) {
    lines.push(carry.toString('utf8'));
  }
} finally {
  fs.closeSync(fp);
}
console.log(lines);
I had the same problem. After comparing several modules that seem to have this feature, I decided to do it myself; it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It keeps the opened file in a closure; the returned fetchBlock() fetches a block from the file and splits it into an array (it also deals with the segment left over from the last fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but code logic is obvious, try it yourself.
Reading / Writing files using stream with the native nodejs modules (fs, readline):
const fs = require('fs');
const readline = require('readline');
// Copy input.json to output.json line by line, logging each line.
const source = fs.createReadStream('input.json');
const target = fs.createWriteStream('output.json');
const rl = readline.createInterface({ input: source, output: target });

rl.on('line', (line) => {
  console.log(line);
  // Do any 'line' processing if you want and then write to the output file
  rl.output.write(`${line}\n`);
});

rl.on('close', () => {
  console.log(`Created "${rl.output.path}"`);
});
Based on this question's answer, I implemented a class you can use to read a file synchronously line by line with fs.readSync(). You can make this "pause" and "resume" by using a Q promise (jQuery seems to require a DOM, so it can't run with nodejs):
var fs = require('fs');
var Q = require('q');
/**
 * Placeholder for the asynchronous per-line work. Declared as a function
 * declaration so it exists before workOnLine() first runs.
 * @param {string} line
 * @returns {Promise} Q promise settled by the async call.
 */
function complexLineTransformation(line) {
  var deferred = Q.defer();
  // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
  return deferred.promise;
}

var lr = new LineReader(filenameToLoad);
lr.open();

/**
 * Processes the next line and, once its promise fulfils, moves on to the
 * following one. Stops at end of file or after the first failure.
 */
function workOnLine() {
  var line = lr.readNextLine();
  if (line === undefined) {
    // readNextLine() yields undefined once no lines are left.
    return;
  }
  return complexLineTransformation(line).then(
    function () { console.log('ok'); return workOnLine(); },
    function () { console.log('error'); }
  );
}

workOnLine();
/**
 * Synchronous line-by-line file reader built on fs.readSync().
 *
 * Usage: create it, call open(), then call readNextLine() until it returns
 * undefined; call close() when done.
 *
 * @param {string} filename - Path of the file to read.
 */
function LineReader (filename) {
  // Becomes false once readNextLine() has run out of lines.
  this.moreLinesAvailable = true;
  this.fd = undefined;
  this.bufferSize = 1024*1024;
  this.buffer = Buffer.alloc(this.bufferSize);
  // Raw bytes after the last newline seen so far. Kept as bytes so that a
  // multi-byte character split across two reads is decoded correctly.
  this.leftOver = Buffer.alloc(0);
  // Number of bytes returned by the most recent read.
  this.read = undefined;
  this.lineNumber = 0;
  this._bundleOfLines = [];
  this._eof = false;

  /** Opens the file for reading. */
  this.open = function() {
    this.fd = fs.openSync(filename, 'r');
  };

  /** Closes the file if it is open. */
  this.close = function() {
    if (this.fd !== undefined) {
      fs.closeSync(this.fd);
      this.fd = undefined;
    }
  };

  /**
   * @returns {string|undefined} The next line without its trailing "\n", or
   *   undefined when the end of the file has been reached.
   */
  this.readNextLine = function () {
    if (this._bundleOfLines.length === 0 && !this._eof) {
      this._readNextBundleOfLines();
    }
    if (this._bundleOfLines.length === 0) {
      this.moreLinesAvailable = false;
      return undefined;
    }
    this.lineNumber++;
    return this._bundleOfLines.shift();
  };

  /** @returns {number} How many lines have been returned so far. */
  this.getLineNumber = function() {
    return this.lineNumber;
  };

  // Reads from the file until at least one complete line is available or the
  // end of the file is reached.
  this._readNextBundleOfLines = function() {
    while (this._bundleOfLines.length === 0) {
      this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null);
      if (this.read === 0) {
        this._eof = true;
        // A final line without a terminating newline is still a line.
        if (this.leftOver.length > 0) {
          this._bundleOfLines.push(this.leftOver.toString('utf8'));
          this.leftOver = Buffer.alloc(0);
        }
        return;
      }
      var data = Buffer.concat([this.leftOver, this.buffer.subarray(0, this.read)]);
      var start = 0;
      var newline;
      while ((newline = data.indexOf(10, start)) !== -1) { // 10 = "\n"
        this._bundleOfLines.push(data.toString('utf8', start, newline));
        start = newline + 1;
      }
      this.leftOver = data.subarray(start);
    }
  };
}
node-byline uses streams, so i would prefer that one for your huge files.
for your date-conversions i would use moment.js.
for maximising your throughput you could think about using a software-cluster. there are some nice-modules which wrap the node-native cluster-module quite well. i like cluster-master from isaacs. e.g. you could create a cluster of x workers which all compute a file.
for benchmarking splits vs regexes use benchmark.js. i havent tested it until now. benchmark.js is available as a node-module
import * as csv from 'fast-csv';
import * as fs from 'fs';
interface Row {
[s: string]: string;
}
type RowCallBack = (data: Row, index: number) => object;
/**
 * Reads a CSV file with fast-csv and maps every row through a callback.
 */
export class CSVReader {
  protected file: string;
  protected csvOptions = {
    delimiter: ',',
    headers: true,
    ignoreEmpty: true,
    trim: true
  };

  /**
   * @param file - Path of the CSV file; must exist.
   * @param csvOptions - Overrides for the default parser options.
   * @throws Error when the file does not exist.
   */
  constructor(file: string, csvOptions = {}) {
    if (!fs.existsSync(file)) {
      throw new Error(`File ${file} not found.`);
    }
    this.file = file;
    this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
  }

  /**
   * Parses the file and collects the callback results in row order.
   *
   * @param callback - Invoked with each row and its 1-based index; may be async.
   * @returns Promise of all callback results. It rejects when reading or
   *   parsing fails, or when the callback throws or rejects.
   */
  public read(callback: RowCallBack): Promise<Array<object>> {
    return new Promise<Array<object>>((resolve, reject) => {
      const readStream = fs.createReadStream(this.file);
      const pending: Array<Promise<object>> = [];
      let index = 0;
      const csvStream = csv.parse(this.csvOptions).on('data', (data: Row) => {
        index++;
        const rowIndex = index;
        // Wrapping in a Promise also captures synchronous throws.
        const result = new Promise<object>(done => done(callback(data, rowIndex)));
        // Fail fast and keep the rejection from going unhandled.
        result.catch(reject);
        pending.push(result);
      }).on('error', (err: Error) => {
        console.error(err.message);
        // Throwing inside an event handler would not reach the caller.
        reject(err);
      }).on('end', () => {
        // Resolve only after every (possibly async) callback has finished.
        Promise.all(pending).then(resolve, reject);
      });
      readStream.on('error', reject);
      readStream.pipe(csvStream);
    });
  }
}
import { CSVReader } from '../src/helpers/CSVReader';
// Load the users CSV and convert each row to the camelCase shape used here.
(async () => {
  const reader = new CSVReader('./database/migrations/csv/users.csv');
  const toUser = async data => ({
    username: data.username,
    name: data.name,
    email: data.email,
    cellPhone: data.cell_phone,
    homePhone: data.home_phone,
    roleId: data.role_id,
    description: data.description,
    state: data.state,
  });
  const users = await reader.read(toUser);
  console.log(users);
})();
I have made a node module to read large file asynchronously text or JSON.
Tested on large files.
var fs = require('fs')
, util = require('util')
, stream = require('stream')
, es = require('event-stream');
module.exports = FileReader;
// Constructor; the reader keeps no state of its own.
function FileReader(){
}
/**
 * Reads a file line by line and passes the concatenated lines to `callback`
 * once the stream ends.
 * @param {string} pathToFile - path of the file to read
 * @param {function(string)} callback - receives the accumulated text
 */
FileReader.prototype.read = function(pathToFile, callback){
// Lines are appended without any separator.
var returnTxt = '';
var s = fs.createReadStream(pathToFile)
.pipe(es.split())
.pipe(es.mapSync(function(line){
// pause the readstream
s.pause();
//console.log('reading line: '+line);
returnTxt += line;
// resume the readstream, possibly from a callback
s.resume();
})
// NOTE(review): on 'error' only a message is logged; `callback` is not invoked.
.on('error', function(){
console.log('Error while reading file.');
})
.on('end', function(){
console.log('Read entire file.');
callback(returnTxt);
})
);
};
/**
 * Reads a file and parses its content as JSON.
 *
 * @param {string} pathToFile - path of the JSON file
 * @param {function(object=, Error=)} callback - receives the parsed object on
 *   success; on invalid JSON it receives (undefined, error)
 */
FileReader.prototype.readJSON = function (pathToFile, callback) {
  this.read(pathToFile, function (txt) {
    var parsed;
    // Parsing happens later, inside this callback, so the try/catch has to
    // live here: one around this.read() can never see the parse error.
    try {
      parsed = JSON.parse(txt);
    } catch (err) {
      callback(undefined, new Error('json file is not valid! ' + err.stack));
      return;
    }
    callback(parsed);
  });
};
Just save the file as file-reader.js, and use it like this:
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){/*callback logic here*/});

Resources