What I am doing is using fs to stitch five HTML page parts (head, chead, topaside, main, footer) into a complete page (html). The file is named htmlpage.js, so you can just run node htmlpage.js file1 file2 file3 ... from the command line, and it will stitch those parts together and spit out file1.html, file2.html, file3.html, and so on. I don't want to use a template engine/library/framework or whatever, especially while I am learning.
Here is the source code:
'use strict';
const fs = require('fs'),
      head = fs.createReadStream('./html-parts/head.html', 'utf8'),
      topaside = fs.createReadStream('./html-parts/topaside.html', 'utf8'),
      footer = fs.createReadStream('./html-parts/footer.html', 'utf8');
let name = process.argv.slice(2),
    htmlray = [],
    ni = 0,
    nl = name.length;
for (ni; ni < nl; ni++) {
  let cheadP = './html-parts/' + name[ni] + '-head.html',
      mainP = './html-parts/' + name[ni] + '-main.html',
      htmlP = name[ni] + '.html',
      chead = fs.createReadStream(cheadP, 'utf8'),
      main = fs.createReadStream(mainP, 'utf8'),
      html = fs.createWriteStream(htmlP, 'utf8');
  //let those parts form an array
  htmlray = [html, head, chead, topaside, main, footer];
  openendPipe(htmlray[1], htmlray[0]);
  htmlray[1].on('end', () => {
    openendPipe(htmlray[2], htmlray[0]);
    htmlray[2].on('end', () => {
      openendPipe(htmlray[3], htmlray[0]);
      htmlray[3].on('end', () => {
        openendPipe(htmlray[4], htmlray[0]);
        htmlray[4].on('end', () => {
          htmlray[5].pipe(htmlray[0]);
          htmlray[5].on('end', () => {
            console.log(name + '.html' + ' created');
          });
        });
      });
    });
  });
}
function openendPipe(src, dst) {
  return src.pipe(dst, {end: false});
}
But what if htmlray had 100 parts? I want to replace this code with an iteration; let's call the block below pipeblock:
openendPipe(htmlray[1], htmlray[0]);
htmlray[1].on('end', () => {
  openendPipe(htmlray[2], htmlray[0]);
  htmlray[2].on('end', () => {
    openendPipe(htmlray[3], htmlray[0]);
    htmlray[3].on('end', () => {
      openendPipe(htmlray[4], htmlray[0]);
      htmlray[4].on('end', () => {
        htmlray[5].pipe(htmlray[0]);
        htmlray[5].on('end', () => {
          console.log(name + '.html' + ' created');
        });
      });
    });
  });
});
I tried these solutions, but they didn't work:
Solution 1:
(function () {
  let i = 0, count = 1;
  function nextpipe() {
    let arr = arguments[0];
    i++;
    if (count > 5) return;
    openendPipe(arr[i], arr[0]);
    count++;
    arr[i].on('end', nextpipe);
  }
  return nextpipe;
})();
//then replace 'pipeblock' with 'nextpipe(htmlray)';
//console.log: nextpipe is undefined
//(the IIFE returns nextpipe, but the return value is never assigned to anything, so the name doesn't exist outside the IIFE)
Solution 2:
//replace 'pipeblock' with this code
let pi = 1,
    pl = htmlray.length - 1;
htmlray[pi].pipe(htmlray[0], {end: false});
htmlray[pi].on('end', nextpipe);
function nextpipe() {
  if (pi > pl) return console.log(name + '.html' + ' created');
  pi++;
  htmlray[pi].pipe(htmlray[0], {end: false});
  htmlray[pi].on('end', nextpipe);
}
//console.log:
//htmlray[pi].pipe(htmlray[0], {end: false});
//TypeError: Cannot read property 'pipe' of undefined
//(pi is incremented past the last index before the guard catches it; the check should be pi >= pl)
This thing is called "callback hell", and you should either use a library to handle async calls, like async.js, or (better) use promises.
Just promisify fs.readFile like this (or write your own promise wrapper for fs.createReadStream if you want to keep using it):
const readFile = Promise.promisify(require('fs').readFile);
and then combine your read promises using Promise.all().
Here are examples for fs with the Bluebird promise library: http://bluebirdjs.com/docs/api/promise.promisify.html, http://bluebirdjs.com/docs/api/promise.all.html
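A minimal sketch of that approach, assuming Bluebird is installed; the part list and output name are illustrative (the chead and main parts would really be built from each command-line name):
const Promise = require('bluebird');
const fs = require('fs');
const readFile = Promise.promisify(fs.readFile);

// read every part in parallel; Promise.all preserves the order of results,
// so the parts can simply be joined and written out in one go
const parts = ['head', 'chead', 'topaside', 'main', 'footer']
  .map(part => readFile('./html-parts/' + part + '.html', 'utf8'));

Promise.all(parts)
  .then(contents => fs.writeFileSync('file1.html', contents.join('')))
  .catch(err => console.error(err));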
While reading through the async and promise documentation, I got to the parts about parallel, series, and waterfall. My problem is about creating an HTML page, so it can't be done in parallel, but the documentation starts with and talks a lot about parallel, which just added confusion. Grasping all of it would probably take a long time. Anyway, while experimenting, I came up with an easy solution of my own.
In my problem, the repetitive part is checking whether the previous HTML part has finished writing, using readingStream.on('end', <do next writing>);, so I made that into a function:
function ifend(src, src2, dst) {
  return src.on('end', () => { return openendPipe(src2, dst); });
}
Then I can turn pipeblock into this (note that htmlray now holds only the readable parts, and html is the write stream):
openendPipe(htmlray[0], html);
ifend(htmlray[0], htmlray[1], html);
ifend(htmlray[1], htmlray[2], html);
ifend(htmlray[2], htmlray[3], html);
ifend(htmlray[3], htmlray[4], html);
Then I can do an iteration in a function:
function createHtml(src, dst) {
  let l = src.length - 1,
      i = 0;
  openendPipe(src[0], dst);
  for (i; i < l; i++) {
    //iterate ifend function
    ifend(src[i], src[i + 1], dst);
  }
  //every pipe above passes {end: false}, so close the write stream
  //once the last part has finished
  src[l].on('end', () => dst.end());
  return dst;
}
Now I can replace pipeblock with this:
createHtml(htmlray, html);
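For reference, the same sequential piping can be written with async/await and events.once (Node 11.13+); a minimal sketch of an equivalent createHtml:
const { once } = require('events');

async function createHtmlAsync(src, dst) {
  for (const part of src) {
    part.pipe(dst, { end: false }); // same open-ended pipe as above
    await once(part, 'end');        // wait for this part before starting the next
  }
  dst.end();
}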
Related
Creating a very simple Node.js utility to process each record separately in a text file (line by line), but it is surprisingly difficult to handle the following scenario due to the inherently async world of Node:
1. Open a connection to the database
2. Read each line of a text file
3. Based on conditions within the processed text of the line, look up a record in the database
4. Upon completion of reading the text file, close the database connection
The challenge I face is that the text file is read line by line (using the 'readline' module), attaching a listener to the 'line' event emitted by the module. The lines of the file are all processed rapidly and the queries to the database are queued up. I have tried many approaches to essentially create a synchronous process, to no avail. Here is my latest attempt, which is definitely full of async/await functions. Being a longtime developer but new to Node.js, I know I am missing something simple. Any guidance will be greatly appreciated.
const { Pool, Client } = require('pg')
const client = new Client({
  user: '*****',
  host: '****',
  database: '*****',
  password: '******#',
  port: 5432,
})
client.connect()
  .then(() => {
    console.log("Connected");
    console.log("Processing file");
    const fs = require('fs');
    const readline = require('readline');
    const instream = fs.createReadStream("input.txt");
    const outstream = new (require('stream'))();
    const rl = readline.createInterface(instream, outstream);
    rl.on('line', async function (line) {
      var callResult;
      if (line.length > 0) {
        var words = line.replace(/[^0-9a-z ]/gi, '').split(" ");
        var len = words.length;
        for (var i = 0; i < words.length; i++) {
          if (words[i].length === 0) {
            words.splice(i, 1);
            i--;
          } else {
            words[i] = words[i].toLowerCase();
          }
        }
        for (var i = 0; i < words.length; i++) {
          if (i <= words.length - 3) {
            callResult = await isKeyPhrase(words[i].trim() + " " + words[i + 1].trim() + " " + words[i + 2].trim());
            if (!callResult) {
              callResult = await isKeyPhrase(words[i].trim() + " " + words[i + 1].trim());
              if (!callResult) {
                callResult = await isKeyPhrase(words[i].trim());
              }
            }
          } else if (i <= words.length - 2) {
            callResult = await isKeyPhrase(words[i].trim() + " " + words[i + 1].trim());
            if (!callResult) {
              callResult = await isKeyPhrase(words[i].trim());
            }
          } else if (i < words.length) {
            callResult = await isKeyPhrase(words[i].trim());
          }
        }
      } // (line.length > 0)
    });
    rl.on('close', function (line) {
      console.log('done reading file.');
      // stubbed out because queries are still running
      //client.end();
    });
  }).catch((err) => {
    console.error('connection error', err.stack);
  });

async function isKeyPhrase(keyPhraseText) {
  var callResult = false;
  return new Promise(async function (resolve, reject) {
    const query = {
      name: 'get-name',
      text: 'select KP.EntryID from KeyPhrase KP where (KP.KeyPhraseText = $1) and (Active = true)',
      values: [keyPhraseText],
      rowMode: 'array'
    }
    // promise
    await client.query(query)
      .then(result => {
        if (result.rowCount == 1) {
          console.log(`Key phrase '${keyPhraseText}' found in table with Phase ID = ${result.rows}`);
          callResult = true;
        }
      }).catch(e => {
        console.error(e.stack);
        reject(e);
      });
    resolve(callResult);
  });
}
welcome to StackOverflow. :)
Indeed there's no (sensible) way to read a file synchronously while interacting with the data per line in a database, and no feasible way at all once the file is bigger than, say, an eighth of your memory.
This doesn't mean, however, that there's no way to write sane code for this. The only problem is that standard Node streams (including readline) do not wait for async code.
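On Node versions with async-iteration support, the closest manual workaround is to consume readline as an async iterator, which does wait for each await in the loop body; a rough sketch, reusing the asker's isKeyPhrase:
const fs = require('fs');
const readline = require('readline');

async function processFile() {
  const rl = readline.createInterface({ input: fs.createReadStream('input.txt') });
  for await (const line of rl) {
    // the next line is not consumed until this promise settles
    await isKeyPhrase(line);
  }
  console.log('done reading file.');
}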
I'd recommend using scramjet, a functional stream programming framework pretty much designed for your use case (disclaimer: I'm the author). Here's how the code would look:
const { Pool, Client } = require('pg')
const { StringStream } = require("scramjet");
const fs = require("fs");
const client = new Client({
  user: '*****',
  host: '****',
  database: '*****',
  password: '******#',
  port: 5432,
})
client.connect()
  .then(async () => {
    console.log("Connected, processing file");
    return StringStream
      // this creates a "scramjet" stream from input.
      .from(fs.createReadStream("input.txt"))
      // this splits the input line by line
      .lines()
      // the next line is just to show when the file is fully read
      .use(stream => stream.whenEnd.then(() => console.log("done reading file.")))
      // this cleans and splits the words like the first "for" loop in your code
      .map(line => line.toLowerCase().replace(/[^0-9a-z ]+/g, '').split(" "))
      // this one gets rid of empty lines (i.e. no words)
      .filter(line => line.length > 0)
      // this looks up each phrase like the second "for" loop in your code
      .map(async words => {
        for (var i = 0; i < words.length; i++) {
          const callResult = await isKeyPhrase(words.slice(i, i + 3).join(" "));
          if (callResult) return callResult;
        }
      })
      // this runs the above list of operations to the end and returns a promise.
      .run();
  })
  .then(() => {
    console.log("done processing file.");
    client.end();
  })
  .catch((e) => {
    console.error(e.stack);
  });

async function isKeyPhrase(keyPhraseText) {
  const query = {
    name: 'get-name',
    text: 'select KP.EntryID from KeyPhrase KP where (KP.KeyPhraseText = $1) and (Active = true)',
    values: [keyPhraseText],
    rowMode: 'array'
  };
  const result = await client.query(query);
  if (result.rowCount > 0) {
    console.log(`Key phrase '${keyPhraseText}' found in table with Phase ID = ${result.rows}`);
    return true;
  }
  return false;
}
I compacted and optimized your code in some places, but in general this should get you what you want: scramjet runs each operation asynchronously and waits until all the operations have ended.
I am wondering if there is a simple way to get "synchronous" readline, or at least the appearance of synchronous I/O, in node.js.
I use something like this, but it is quite awkward:
var readline = require('readline');
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});
var i = 0;
var s1 = '';
var s2 = '';
rl.on('line', function(line) {
  if (i == 0) { s1 = line; }
  else if (i == 1) { s2 = line; }
  i++;
})
rl.on('close', function() {
  //do something with lines
})
Instead of this, I would prefer it to be as easy as something like:
var s1 = getline(); // or "await getline()?"
var s2 = getline(); // or "await getline()?"
Helpful conditions:
(a) Prefer not using external modules or /dev/stdio filehandle, I am submitting code to a code submission website and these do not work there
(b) Can use async/await or generators
(c) Should be line based
(d) Should not require reading entire stdin into memory before processing
Just in case someone stumbles upon this in the future:
Node 11.7 added support for this using async/await:
const readline = require('readline');
//const fileStream = fs.createReadStream('input.txt');
const rl = readline.createInterface({
  input: process.stdin, //or fileStream
  output: process.stdout
});
for await (const line of rl) {
  console.log(line);
}
Remember to wrap it in an async function(){}, otherwise you will get a reserved-keyword SyntaxError (await is only valid inside async functions).
const start = async () => {
  for await (const line of rl) {
    console.log(line);
  }
};
start();
To read an individual line, you can use the async iterator manually (note that next() resolves to a {value, done} pair):
const it = rl[Symbol.asyncIterator]();
const line1 = (await it.next()).value;
You can just wrap it in a promise:
const answer = await new Promise(resolve => {
  rl.question("What is your name? ", resolve);
});
console.log(answer);
Like the readline module, there is another module called readline-sync, which takes synchronous input.
Example:
const reader = require("readline-sync"); //npm install readline-sync
let username = reader.question("Username: ");
const password = reader.question("Password: ", { hideEchoBack: true });
if (username == "admin" && password == "foobar") {
  console.log("Welcome!");
}
I think this is what you want:
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

const getLine = (function () {
  const getLineGen = (async function* () {
    for await (const line of rl) {
      yield line;
    }
  })();
  return async () => (await getLineGen.next()).value;
})();

const main = async () => {
  let a = Number(await getLine());
  let b = Number(await getLine());
  console.log(a + b);
  process.exit(0);
};
main();
Note: this answer uses experimental features and needs Node v11.7 or later.
Try this. It's still not a perfect replication of a synchronous line-reading function; e.g. async functions still run later, so some of your calling code may execute out of order, and you can't call it from inside a normal for loop. But it's a lot easier to read than the typical .on or .question code.
// standard 'readline' boilerplate
const readline = require('readline');
const readlineInterface = readline.createInterface({
  input: process.stdin,
  output: process.stdout
});

// new function that promises to ask a question and
// resolve to its answer
function ask(questionText) {
  return new Promise((resolve, reject) => {
    readlineInterface.question(questionText, (input) => resolve(input));
  });
}

// launch your program since `await` only works inside `async` functions
start()

// use promise-based `ask` function to ask several questions
// in a row and assign each answer to a variable
async function start() {
  console.log();
  let name = await ask("what is your name? ");
  let quest = await ask("what is your quest? ");
  let color = await ask("what is your favorite color? ");
  console.log("Hello " + name + "! " +
    "Good luck with " + quest +
    " and here is a " + color + " flower for you.");
  process.exit();
}
UPDATE: https://www.npmjs.com/package/readline-promise implements it (source code here: https://github.com/bhoriuchi/readline-promise/blob/master/src/index.js#L192). It implements several other features as well, but they seem useful too, and not too overengineered, unlike some other NPM packages that purport to do the same thing. Unfortunately, I can't get it to work due to https://github.com/bhoriuchi/readline-promise/issues/5, but I like its implementation of the central function:
function ask(questionText) {
  return new Promise((resolve, reject) => {
    readlineInterface.question(questionText, resolve);
  });
}
Using generators, your example would look like this:
var readline = require('readline');
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});
var i = 0;
var s1 = '';
var s2 = '';
var iter = (function* () {
  s1 = yield;
  i++;
  s2 = yield;
  i++;
  while (true) {
    yield;
    i++;
  }
})();
iter.next();
rl.on('line', line => iter.next(line));
rl.on('close', function() {
  //do something with lines
});
So yield here acts as if it were a blocking getline() and you can handle lines in the usual sequential fashion.
UPD:
And an async/await version might look like the following:
var readline = require('readline');
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});
var i = 0;
var s1 = '';
var s2 = '';
var continuation;
var getline = (() => {
  var thenable = {
    then: resolve => {
      continuation = resolve;
    }
  };
  return () => thenable;
})();
(async function() {
  s1 = await getline();
  i++;
  s2 = await getline();
  i++;
  while (true) {
    await getline();
    i++;
  }
})();
rl.on('line', line => continuation(line));
rl.on('close', function() {
  //do something with lines
});
In both of these "synchronous" versions, i is no longer needed to distinguish lines; it only counts the total number of them.
Here's an example, but it requires reading the entire stdin before giving results, which is not ideal:
var readline = require('readline');
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});
function lineiterator() {
  var currLine = 0;
  var lines = [];
  return new Promise(function(resolve, reject) {
    rl.on('line', function (line) {
      lines.push(line);
    });
    rl.on('close', function () {
      resolve({
        next: function() {
          return currLine < lines.length ? lines[currLine++] : null;
        }
      });
    });
  });
}
Example:
lineiterator().then(function(x) {
  console.log(x.next());
  console.log(x.next());
});
$ echo test$'\n'test | node test.js
test
test
The simplest (and preferred) option is available in the docs: https://nodejs.org/api/readline.html#rlquestionquery-options-callback
const readline = require('readline');
const util = require('util');
// an interface over stdin, as in the examples above (assumed setup)
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
const question = util.promisify(rl.question).bind(rl);

async function questionExample() {
  try {
    const answer = await question('What is your favorite food? ');
    console.log(`Oh, so your favorite food is ${answer}`);
  } catch (err) {
    console.error('Question rejected', err);
  }
}
questionExample();
Since I don't know how many strings you need, I put them all in an array.
Don't hesitate to comment if you need a more detailed answer or if my answer is not exact:
var readline = require('readline');
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});
var i = 0;
var strings = [];
rl.on('line', function(line) {
  // 2 lines below are in case you want to stop the interface after 10 lines
  // if (i == 9)
  //   rl.close()
  strings[i] = line;
  i++;
}).on('close', function() {
  console.log(strings);
});
// this is in case you want to stop the program when you type ctrl + C
process.on('SIGINT', function() {
  rl.close();
});
We can use a promise and process.stdin events together to simulate a synchronous input system:
const { EOL } = require("os");

const getLine = async () =>
  (
    await new Promise((resolve) => {
      // use once() so each call registers (and removes) a single listener
      process.stdin.once("data", (line) => {
        resolve("" + line);
      });
    })
  ).split(EOL)[0];

const line = await getLine();
console.log(line);
I'm looking for a simple way to perform synchronous HTTP requests in node.js, but I'm still getting async responses...
I've realised that node.js is meant for async jobs, but in my case I need the synchronous response to call other functions that use this data; if it's null/undefined, I can't proceed with the flow...
What's the better way to do that?
Here's my code:
function callCellId(data) {
  console.log("Data: " + data);
  var towers = [],
      rscp = [];
  var request = require('sync-request');
  for (var x = 0; x < data.length; x++) {
    console.log("Request data: \n");
    rscp[x] = data[x].rscp;
    var res = request('POST', 'http://opencellid.org/cell/get?key=xxxxx&mcc=' + data[x].mcc + '&mnc=' + data[x].mnc + '&lac=' + data[x].LAC + '&cellid=' + data[x].cellID + '&format=json');
    console.log("loop " + x);
    // parse into a new variable; reassigning `data` would clobber the input array
    var cell = JSON.parse(res.getBody().toString());
    console.log("rsp: " + JSON.stringify(cell));
    towers[x] = {
      'latitude': cell.lat,
      'longitude': cell.lon,
      'rscp': rscp[x],
      'signal': cell.averageSignalStrength
    };
  }
  console.log("Content for triangulation" + JSON.stringify(towers));
  return towers;
}
Using async in a loop could be tricky.
I solved this without external libraries, using generators:
LoopOperation: function() {
  //define generator with the main loop
  var loopIterator = function*() {
    for (var index = 0; index < 10; index++) {
      var result = yield asyncOperation((res) => loopIterator.next(res)); //do something async and execute () => loopIterator.next() when done as callback
      console.log(result);
    }
  }();
  loopIterator.next(); //start loop
}
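On newer Node versions the same sequencing can be written with async/await; a minimal sketch, assuming the same callback-style asyncOperation:
async function loopOperation() {
  for (let index = 0; index < 10; index++) {
    // wrap the callback-style call so it can be awaited
    const result = await new Promise((resolve) => asyncOperation(resolve));
    console.log(result);
  }
}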
Since Node.js is async by nature, whenever we need sequential behaviour (like this nested request stack) we can use promises.
"A Promise is an object representing the eventual completion or failure of an asynchronous operation"
(reference)
I.E.:
const request = require('request-promise');

function callCellId(data) {
  // build one request promise per location; Promise.all keeps results in order
  const towerPromises = data.map(location => {
    const options = {
      url: 'http://opencellid.org/cell/get',
      method: 'POST',
      json: true,
      body: {
        key: 'YOUR_PRIVATE_KEY',
        mcc: location.mcc,
        mnc: location.mnc,
        lac: location.lac,
        cellId: location.cellID
      }
    };
    return request(options).then(cellInfo => ({
      latitude: cellInfo.lat,
      longitude: cellInfo.lon,
      rscp: location.rscp,
      signal: cellInfo.averageSignalStrength
    })).catch(err => {
      console.log('Could not get cellId Info for', location);
      console.log(err);
    });
  });
  // resolves with the towers array once every request has finished
  return Promise.all(towerPromises);
}
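Because this version of callCellId returns a promise, a caller consumes it like this (cellData is an illustrative name for the input array):
callCellId(cellData).then(towers => {
  console.log('Content for triangulation: ' + JSON.stringify(towers));
});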
I'm trying to read a file line by line, perform some action that has a callback, and resume line reading when the function finishes. For example:
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream('./phrases.txt');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
rl.on('line', function (line) {
  rl.pause();
  setTimeout(function () {
    console.log('resuming');
    rl.resume();
  }, 2000);
});
I was under the impression the example above should basically read a line, wait for 2 seconds, console.log, and then continue to the next line. What really happens is that it waits for the first 2 seconds and then spews out lots of console.logs. (As far as I can tell, pause() stops the underlying input stream, but lines readline has already buffered are still emitted.)
The Line By Line module helps you read large text files, line by line, without buffering the files into memory.
You can process the lines asynchronously. This is the example provided:
var LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
  // 'err' contains error object
});

lr.on('line', function (line) {
  // pause emitting of lines...
  lr.pause();
  // ...do your asynchronous line processing..
  setTimeout(function () {
    // ...and continue emitting lines.
    lr.resume();
  }, 100);
});

lr.on('end', function () {
  // All lines are read, file is closed now.
});
A solution without installing any external library: you only need the native node.js "readline" module. Just do the following:
import fs from "fs";
import path from "path";
import readline from "readline";
import { fileURLToPath } from "url";

// __dirname is not defined in ES modules; derive it from import.meta.url
const __dirname = path.dirname(fileURLToPath(import.meta.url));

const readInterface = readline.createInterface({
  input: fs.createReadStream(path.join(__dirname, 'myfile.txt'))
});

for await (const line of readInterface) {
  await someAsynchronousOperation();
}
Source (Official documentation): https://nodejs.org/api/readline.html#rlsymbolasynciterator
A very nice line-reader module exists:
https://github.com/nickewing/line-reader
Simple code:
var lineReader = require('line-reader');
lineReader.eachLine('file.txt', function(line, last) {
  // do whatever you want with line...
  console.log(line);
  if (last) {
    // or check if it's the last one
  }
});
also "java-style" interface for more control:
lineReader.open('file.txt', function(reader) {
  if (reader.hasNextLine()) {
    reader.nextLine(function(line) {
      console.log(line);
    });
  }
});
Another cool solution:
var fs = require('fs'),
    sleep = require('sleep'),
    readline = require('readline');

var rd = readline.createInterface({
  input: fs.createReadStream('phrases.txt'),
  output: process.stdout,
  terminal: false
});

rd.on('line', function(line) {
  console.log('-------');
  console.log(line);
  sleep.sleep(2); // note: this blocks the event loop for 2 seconds
});
function createLineReader(fileName) {
  var EM = require("events").EventEmitter;
  var ev = new EM();
  var stream = require("fs").createReadStream(fileName);
  var remainder = null;
  stream.on("data", function(data) {
    if (remainder != null) { // append newly received data chunk
      data = Buffer.concat([remainder, data]);
    }
    var start = 0;
    for (var i = 0; i < data.length; i++) {
      if (data[i] == 10) { // \n new line
        var line = data.slice(start, i);
        ev.emit("line", line);
        start = i + 1;
      }
    }
    if (start < data.length) {
      remainder = data.slice(start);
    } else {
      remainder = null;
    }
  });
  stream.on("end", function() {
    if (null != remainder) ev.emit("line", remainder);
  });
  return ev;
}
//---------main---------------
fileName = process.argv[2];
lineReader = createLineReader(fileName);
lineReader.on("line", function(line) {
  console.log(line.toString());
  //console.log("++++++++++++++++++++")
});
Here is a simple solution in TypeScript using line-reader that can run in nodejs 8:
import lineReader from 'line-reader';

function readLines(filename: string, processLine: (line: string) => Promise<void>): Promise<void> {
  return new Promise((resolve, reject) => {
    lineReader.eachLine(filename, (line, last, callback) => {
      if (!callback) throw new Error('panic');
      processLine(line)
        .then(() => last ? resolve() : callback())
        .catch(reject);
    });
  });
}

async function echo(): Promise<void> {
  await readLines('/dev/stdin', async (line) => {
    console.log(line);
  });
}

echo();
Note that it does not buffer the whole file before executing, therefore it is suitable for processing large text files.
I suggest using stdio for this kind of thing, as the input stream is paused and resumed automatically and you don't need to worry about your system resources. You'll be able to read really huge files with just a few MBs of memory:
This example prints a line every 2 seconds:
$ node myprogram.js < file.txt
import { read } from 'stdio';

async function onLine(line) {
  console.log(line);
  await sleep(2000);
}

read(onLine)
  .then(() => console.log('finished'));
Note I'm using an asynchronous sleep to represent any asynchronous task. It is not included in Node.js by default but it would be as follows:
const sleep = (delay) => new Promise((resolve) => setTimeout(resolve, delay));
const readline = require('readline');
const fs = require('fs');

const rl = readline.createInterface({
  input: fs.createReadStream('sample.txt')
});

rl.on('line', (line) => {
  console.log(`Line from file: ${line}`);
});
source: https://nodejs.org/api/readline.html#readline_example_read_file_stream_line_by_line
I need to do some parsing of large (5-10 GB) log files in Javascript/Node.js (I'm using Cube).
The log line looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be a fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task; surely there's a simple way within the stdlib to read a text file line by line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly: I'm guessing using string splits and the JS equivalent of contains (indexOf != -1?) will be a lot faster than regexes? Has anybody had much experience parsing massive amounts of text data in Node.js?
I searched for a solution to parse very large files (GBs) line by line using a stream. All the third-party libraries and examples did not suit my needs, since they either processed the files not line by line or read the entire file into memory.
The following solution can parse very large files, line by line, using stream & pipe. For testing I used a 2.1 GB file with 17,000,000 records. RAM usage did not exceed 60 MB.
First, install the event-stream package:
npm install event-stream
Then:
var fs = require('fs')
  , es = require('event-stream');

var lineNr = 0;

var s = fs.createReadStream('very-large-file.csv')
  .pipe(es.split())
  .pipe(es.mapSync(function(line) {
    // pause the readstream
    s.pause();
    lineNr += 1;
    // process line here and call s.resume() when rdy
    // function below was for logging memory usage
    logMemoryUsage(lineNr);
    // resume the readstream, possibly from a callback
    s.resume();
  })
  .on('error', function(err) {
    console.log('Error while reading file.', err);
  })
  .on('end', function() {
    console.log('Read entire file.');
  })
);
Please let me know how it goes!
You can use the inbuilt readline package, see the docs here. I use stream to create a new output stream.
var fs = require('fs'),
    readline = require('readline'),
    stream = require('stream');

var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;

var rl = readline.createInterface({
  input: instream,
  output: outstream,
  terminal: false
});

rl.on('line', function(line) {
  console.log(line);
  //Do your stuff ...
  //Then write to output stream
  rl.write(line);
});
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted one here. I made some improvements:
- Code is in a class (modular)
- Parsing is included
- The ability to resume is exposed to the outside, in case an asynchronous job is chained to reading the CSV, like inserting to a DB or an HTTP request
- Reading in chunk/batch sizes the user can declare. I took care of encoding in the stream too, in case you have files in a different encoding.
Here's the code:
'use strict'

const fs = require('fs'),
    util = require('util'),
    stream = require('stream'),
    es = require('event-stream'),
    parse = require("csv-parse"),
    iconv = require('iconv-lite');

class CSVReader {
  constructor(filename, batchSize, columns) {
    this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
    this.batchSize = batchSize || 1000
    this.lineNumber = 0
    this.data = []
    this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
  }

  read(callback) {
    this.reader
      .pipe(es.split())
      .pipe(es.mapSync(line => {
        ++this.lineNumber
        parse(line, this.parseOptions, (err, d) => {
          this.data.push(d[0])
        })
        if (this.lineNumber % this.batchSize === 0) {
          // hand the batch to the caller and wait for continue()
          this.reader.pause()
          callback(this.data)
        }
      })
      .on('error', function(){
        console.log('Error while reading file.')
      })
      .on('end', function(){
        console.log('Read entire file.')
      }))
  }

  continue () {
    this.data = []
    this.reader.resume()
  }
}

module.exports = CSVReader
So basically, here is how you will use it:
let reader = new CSVReader('path_to_file.csv')
reader.read(() => reader.continue())
I tested this with a 35GB CSV file and it worked for me, which is why I chose to build on @gerard's answer. Feedback is welcome.
I used https://www.npmjs.com/package/line-by-line for reading more than 1,000,000 lines from a text file. In this case, RAM usage was about 50-60 MB.
const LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
  // 'err' contains error object
});

lr.on('line', function (line) {
  // pause emitting of lines...
  lr.pause();
  // ...do your asynchronous line processing..
  setTimeout(function () {
    // ...and continue emitting lines.
    lr.resume();
  }, 100);
});

lr.on('end', function () {
  // All lines are read, file is closed now.
});
The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('sample.txt'),
  crlfDelay: Infinity
});

rl.on('line', (line) => {
  console.log(`Line from file: ${line}`);
});

await once(rl, 'close');
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.
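In a CommonJS file, where top-level await is not available, the tail of the example can be wrapped in an async IIFE; a sketch:
(async () => {
  await once(rl, 'close');
  console.log('File processed.');
})();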
Apart from reading the big file line by line, you can also read it chunk by chunk. For more, refer to this article:
var fs = require('fs');

var offset = 0;
var chunkSize = 2048;
var chunkBuffer = Buffer.alloc(chunkSize);
var fp = fs.openSync('filepath', 'r');
var bytesRead = 0;
var lines = [];
while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset)) !== 0) {
  offset += bytesRead;
  var str = chunkBuffer.slice(0, bytesRead).toString();
  var arr = str.split('\n');

  if (bytesRead === chunkSize) {
    // the last item of the arr may be not a full line, leave it to the next chunk
    offset -= arr.pop().length;
  }
  lines.push(arr);
}
console.log(lines);
I had the same problem. After comparing several modules that seem to have this feature, I decided to do it myself; it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It keeps the opened file in a closure; the returned fetchBlock() fetches a block from the file and splits it into an array (handling the segment left over from the last fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but the code logic is obvious; try it yourself.
Reading/writing files using streams with the native nodejs modules (fs, readline):
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('input.json'),
  output: fs.createWriteStream('output.json')
});

rl.on('line', function(line) {
  console.log(line);
  // Do any 'line' processing if you want and then write to the output file
  this.output.write(`${line}\n`);
});

rl.on('close', function() {
  console.log(`Created "${this.output.path}"`);
});
Based on this question's answers I implemented a class you can use to read a file synchronously, line by line, with fs.readSync(). You can make it "pause" and "resume" by using a Q promise (jQuery seems to require a DOM, so it can't run in nodejs):
var fs = require('fs');
var Q = require('q');

var complexLineTransformation = function (line) {
  var deferred = Q.defer();
  // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
  return deferred.promise;
};

var lr = new LineReader(filenameToLoad);
lr.open();

var promise;
var workOnLine = function () {
  var line = lr.readNextLine();
  promise = complexLineTransformation(line).then(
    function() { console.log('ok'); workOnLine(); },
    function() { console.log('error'); }
  );
};
workOnLine();

function LineReader(filename) {
  this.moreLinesAvailable = true;
  this.fd = undefined;
  this.bufferSize = 1024 * 1024;
  this.buffer = Buffer.alloc(this.bufferSize);
  this.leftOver = '';
  this.read = undefined;
  this.idxStart = undefined;
  this.idx = undefined;
  this.lineNumber = 0;
  this._bundleOfLines = [];

  this.open = function() {
    this.fd = fs.openSync(filename, 'r');
  };

  this.readNextLine = function() {
    if (this._bundleOfLines.length === 0) {
      this._readNextBundleOfLines();
    }
    this.lineNumber++;
    var lineToReturn = this._bundleOfLines[0];
    this._bundleOfLines.splice(0, 1); // remove first element (pos, howmany)
    return lineToReturn;
  };

  this.getLineNumber = function() {
    return this.lineNumber;
  };

  this._readNextBundleOfLines = function() {
    var line = "";
    while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
      this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
      this.idxStart = 0;
      while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
        line = this.leftOver.substring(this.idxStart, this.idx);
        this._bundleOfLines.push(line);
        this.idxStart = this.idx + 1;
      }
      this.leftOver = this.leftOver.substring(this.idxStart);
      if (line !== "") {
        break;
      }
    }
  };
}
node-byline uses streams, so I would prefer that one for your huge files.
For your date conversions I would use moment.js.
For maximising your throughput you could think about using a software cluster. There are some nice modules which wrap the node-native cluster module quite well. I like cluster-master from isaacs; e.g. you could create a cluster of x workers which all compute a file.
For benchmarking splits vs regexes, use benchmark.js. I haven't tested it until now. benchmark.js is available as a node module.
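A rough benchmark.js sketch for the splits-vs-regex question (the sample line and patterns are only illustrative):
const Benchmark = require('benchmark');

const line = '10:00:43.343423 I\'m a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".';
const suite = new Benchmark.Suite();

suite
  .add('indexOf + split', function() {
    line.indexOf('"SUCCESS"') !== -1 && line.split(' ');
  })
  .add('regex', function() {
    /state "(\w+)"/.exec(line);
  })
  .on('cycle', function(event) {
    console.log(String(event.target)); // prints ops/sec for each approach
  })
  .run();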
import * as csv from 'fast-csv';
import * as fs from 'fs';

interface Row {
  [s: string]: string;
}

type RowCallBack = (data: Row, index: number) => object;

export class CSVReader {
  protected file: string;
  protected csvOptions = {
    delimiter: ',',
    headers: true,
    ignoreEmpty: true,
    trim: true
  };

  constructor(file: string, csvOptions = {}) {
    if (!fs.existsSync(file)) {
      throw new Error(`File ${file} not found.`);
    }
    this.file = file;
    this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
  }

  public read(callback: RowCallBack): Promise<Array<object>> {
    return new Promise<Array<object>>(resolve => {
      const readStream = fs.createReadStream(this.file);
      const results: Array<any> = [];
      let index = 0;
      const csvStream = csv.parse(this.csvOptions).on('data', async (data: Row) => {
        index++;
        results.push(await callback(data, index));
      }).on('error', (err: Error) => {
        console.error(err.message);
        throw err;
      }).on('end', () => {
        resolve(results);
      });
      readStream.pipe(csvStream);
    });
  }
}
import { CSVReader } from '../src/helpers/CSVReader';

(async () => {
  const reader = new CSVReader('./database/migrations/csv/users.csv');
  const users = await reader.read(async data => {
    return {
      username: data.username,
      name: data.name,
      email: data.email,
      cellPhone: data.cell_phone,
      homePhone: data.home_phone,
      roleId: data.role_id,
      description: data.description,
      state: data.state,
    };
  });
  console.log(users);
})();
I have made a node module to read large text or JSON files asynchronously.
Tested on large files.
var fs = require('fs')
  , util = require('util')
  , stream = require('stream')
  , es = require('event-stream');

module.exports = FileReader;

function FileReader() {
}

FileReader.prototype.read = function(pathToFile, callback) {
  var returnTxt = '';
  var s = fs.createReadStream(pathToFile)
    .pipe(es.split())
    .pipe(es.mapSync(function(line) {
      // pause the readstream
      s.pause();
      //console.log('reading line: '+line);
      returnTxt += line;
      // resume the readstream, possibly from a callback
      s.resume();
    })
    .on('error', function() {
      console.log('Error while reading file.');
    })
    .on('end', function() {
      console.log('Read entire file.');
      callback(returnTxt);
    })
  );
};

FileReader.prototype.readJSON = function(pathToFile, callback) {
  this.read(pathToFile, function(txt) {
    // parse inside the callback so a bad file is caught once the text arrives
    try {
      callback(JSON.parse(txt));
    } catch (err) {
      throw new Error('json file is not valid! ' + err.stack);
    }
  });
};
Just save the file as file-reader.js, and use it like this:
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj) { /* callback logic here */ });