NodeJS, promises, streams - processing large CSV files - node.js

I need to build a function for processing large CSV files for use in a bluebird.map() call. Given the potential sizes of the file, I'd like to use streaming.
This function should accept a stream (a CSV file) and a function (that processes the chunks from the stream) and return a promise when the file is read to end (resolved) or errors (rejected).
So, I start with:
// NOTE(review): `api` is assumed to be defined elsewhere in the application;
// it is not declared in this snippet.
'use strict';
var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});
// Returns a promise that settles when the piped CSV parser ends (resolve)
// or errors (reject). The `processor` callback is not yet wired in — that
// is the open question below.
// NOTE(review): the stream is both passed to csv.parse() AND piped into the
// parser; csv.parse normally takes options (or a string) as its first
// argument, so passing the stream here is likely unnecessary — confirm
// against the csv package docs.
api.parsers.processCsvStream = function(passedStream, processor) {
var parser = csv.parse(passedStream, {trim: true});
passedStream.pipe(parser);
// use readable or data event?
parser.on('readable', function() {
// call processor, which may be async
// how do I throttle the amount of promises generated
});
var db = pgp(api.config.mailroom.fileMakerDbConfig);
// NOTE(review): `Promise` here is the native global, not the bluebird
// instance imported as lowercase `promise` above.
return new Promise(function(resolve, reject) {
parser.on('end', resolve);
parser.on('error', reject);
});
}
Now, I have two inter-related issues:
I need to throttle the actual amount of data being processed, so as to not create memory pressures.
The function passed as the processor param is going to often be async, such as saving the contents of the file to the db via a library that is promise-based (right now: pg-promise). As such, it will create a promise in memory and move on, repeatedly.
The pg-promise library has functions to manage this, like page(), but I'm not able to wrap my head around how to mix stream event handlers with these promise methods. Right now, I return a promise in the handler for the readable section after each read(), which means I create a huge amount of promised database operations and eventually fault out because I hit a process memory limit.
Does anyone have a working example of this that I can use as a jumping point?
UPDATE: Probably more than one way to skin the cat, but this works:
'use strict';
var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});
// Batch-processing variant: on each 'readable' event a db.task is started
// that pages through up to fileParserConcurrency records at a time,
// pausing the parser while a batch is in flight and resuming afterwards.
api.parsers.processCsvStream = function(passedStream, processor) {
// some checks trimmed out for example
var db = pgp(api.config.mailroom.fileMakerDbConfig);
var parser = csv.parse(passedStream, {trim: true});
passedStream.pipe(parser);
// Drain up to fileParserConcurrency parsed records from the parser's
// internal buffer, then pause the stream. Returning undefined (no records
// left) signals pg-promise's page() to finish the sequence.
var readDataFromStream = function(index, data, delay) {
var records = [];
var record;
do {
record = parser.read();
if(record != null)
records.push(record);
} while(record != null && (records.length < api.config.mailroom.fileParserConcurrency))
parser.pause();
if(records.length)
return records;
};
// Receives each batch from page() and resumes the parser so the next
// 'readable' event can fire. NOTE(review): the `processor` argument is
// never invoked — presumably it should be called with `data`; confirm.
var processData = function(index, data, delay) {
console.log('processData(' + index + ') > data: ', data);
parser.resume();
};
// NOTE(review): a fresh task is created on every 'readable' event, so
// batches can overlap, and the task promise is never awaited or
// error-handled.
parser.on('readable', function() {
db.task(function(tsk) {
this.page(readDataFromStream, processData);
});
});
// Settles when the parser finishes or fails.
return new Promise(function(resolve, reject) {
parser.on('end', resolve);
parser.on('error', reject);
});
}
Anyone sees a potential problem with this approach?

You might want to look at promise-streams
// promise-streams provides promise-aware stream helpers with backpressure.
const ps = require('promise-streams');

// Parse the incoming CSV, then run the (possibly async) row processor with
// at most 4 rows in flight at a time.
const parsed = passedStream.pipe(csv.parse({trim: true}));
const processed = parsed.pipe(
  ps.map({concurrent: 4}, (row) =>
    processRowDataWhichMightBeAsyncAndReturnPromise(row)
  )
);
// .wait() returns a promise that resolves once the pipeline has drained.
processed.wait().then(_ => {
  console.log("All done!");
});
Works with backpressure and everything.

Find below a complete application that correctly executes the same kind of task as you want: It reads a file as a stream, parses it as a CSV and inserts each row into the database.
const fs = require('fs');
const promise = require('bluebird');
const csv = require('csv-parse');
const pgp = require('pg-promise')({promiseLib: promise});

// Connection string. Fixed: the host separator is '@', not '#'
// (the '#' was a copy/paste artifact).
const cn = "postgres://postgres:password@localhost:5432/test_db";
const rs = fs.createReadStream('primes.csv');
const db = pgp(cn);

// Invoked by spex's stream.read for each batch of rows pulled from the CSV
// stream; `data` is an array of parsed rows.
function receiver(_, data) {
  // Feeds this.sequence one insert at a time; resolving with undefined when
  // the batch is exhausted ends the sequence.
  function source(index) {
    if (index < data.length) {
      // here we insert just the first column value that contains a prime number;
      return this.none('insert into primes values($1)', data[index][0]);
    }
  }
  return this.sequence(source);
}

db.task(t => {
  return pgp.spex.stream.read.call(t, rs.pipe(csv()), receiver);
})
  .then(data => {
    console.log('DATA:', data);
  }) // fixed: this closing parenthesis was missing, making the chain a syntax error
  .catch(error => {
    console.log('ERROR:', error);
  });
Note that the only thing I changed: using library csv-parse instead of csv, as a better alternative.
Added use of method stream.read from the spex library, which properly serves a Readable stream for use with promises.

I found a slightly better way of doing the same thing; with more control. This is a minimal skeleton with precise parallelism control. With parallel value as one all records are processed in sequence without having the entire file in memory, we can increase parallel value for faster processing.
const csv = require('csv');
const csvParser = require('csv-parser')
const fs = require('fs');
const readStream = fs.createReadStream('IN');
const writeStream = fs.createWriteStream('OUT');
// csv.transform calls the handler for each record; `parallel` caps how many
// handlers run concurrently, so parallel: 1 gives strictly sequential
// processing without buffering the whole file.
// NOTE(review): `asyncTask(...)` is a pseudo-code placeholder, not valid JS.
const transform = csv.transform({ parallel: 1 }, (record, done) => {
asyncTask(...) // return Promise
.then(result => {
// ... do something when success
return done(null, record);
}, (err) => {
// ... do something when error
// NOTE(review): calling done(null, record) here swallows the error and
// keeps the record; pass done(err) if failures should abort the pipeline.
return done(null, record);
})
}
);
// Pipeline: read file -> parse CSV -> per-record async transform ->
// re-stringify -> write file.
readStream
.pipe(csvParser())
.pipe(transform)
.pipe(csv.stringify())
.pipe(writeStream);
This allows doing an async task for each record.
To return a promise instead, we can create a pending promise up front and resolve it when the stream finishes.
// Fragment: attach to the stream from the pipeline above; 'end' fires once
// all data has been consumed. `csvData` is accumulated elsewhere (not shown).
.on('end',function() {
//do something with csvData
console.log(csvData);
});

So to say you don't want streaming but some kind of data chunks? ;-)
Do you know https://github.com/substack/stream-handbook?
I think the simplest approach without changing your architecture would be some kind of promise pool. e.g. https://github.com/timdp/es6-promise-pool

Related

Got incomplete data on stream piping to an express response

Need to convert a DB table to a csv report.
If I immediately unload the entire table with one query then the application crashes because the memory runs out. I decided to query data from the table in portions of 100 rows, convert each row into a line of the report and write it into a stream that is piped with an express response.
All this happens nearly like this:
DB query
// Fetch the next page of up to 100 users created strictly before the given
// epoch-seconds timestamp, newest first (keyset pagination on created_at).
// Fixed: the timestamp is now passed as a bound parameter ($1) instead of
// being interpolated into the SQL string, which was an SQL-injection vector.
const select100Users = (maxUserCreationDateStr) => {
  return db.query(`
    SELECT * FROM users WHERE created_at < to_timestamp($1)
    ORDER BY created_at DESC LIMIT 100`, [maxUserCreationDateStr]);
}
stream initialisation
const { PassThrough } = require('stream');

// Create a pass-through stream and kick off the async report writer on it.
// Any failure inside writeUserReport is surfaced as a stream 'error' event
// so the consumer (the piped response) can react to it.
const getUserReportStream = () => {
  const reportStream = new PassThrough();
  writeUserReport(reportStream).catch((err) => {
    reportStream.emit('error', err);
  });
  return reportStream;
};
piping the stream with an express response
// GET /report — stream the user report to the client as a spreadsheet
// download. Fixed: the Content-Disposition header name was opened with a
// backtick but closed with a single quote, which was a syntax error.
// NOTE(review): `filename` must be defined in the surrounding scope.
app.get('/report', (req, res) => {
  const stream = getUserReportStream();
  res.setHeader('Content-Type', 'application/vnd.ms-excel');
  res.setHeader('Content-Disposition', `attachment; filename="${ filename }"`);
  stream.pipe(res);
});
and finally how do I write data to the stream
// Write the whole user report into `stream`, paging through users 100 at a
// time (keyset pagination on created_at) so the table never sits fully in
// memory. Fixed: the original declaration `const writeUserReport(stream) => {`
// was a syntax error, and the body used `await` without being async.
const writeUserReport = async (stream) => {
  // Promisified stream.write: awaiting the write callback respects
  // backpressure instead of queueing every chunk in memory.
  const writeAsync = (data) =>
    new Promise((resolve, reject) =>
      stream.write(data, (error) => (error ? reject(error) : resolve()))
    );

  let maxUserCreationDateGlobal = Math.trunc(Date.now() / 1000);
  let flag = true;
  await writeAsync(USER_REPORT_HEADER);
  while (flag) {
    const rows100 = await select100Users(maxUserCreationDateGlobal);
    console.log(rows100.length);
    if (rows100.length === 0) {
      flag = false;
    } else {
      // Track the oldest created_at seen in this page; it becomes the
      // upper bound for the next page's query.
      let maxUserCreationDate = maxUserCreationDateGlobal;
      const users100 = await Promise.all(
        rows100.map((r) => {
          const created_at = r.created_at;
          const createdAt = new Date(created_at);
          if (created_at && createdAt.toString() !== 'Invalid Date') {
            const createdAtNumber = Math.trunc(createdAt.valueOf() / 1000);
            maxUserCreationDate = Math.min(maxUserCreationDate, createdAtNumber);
          }
          return mapUser(r); // returns a promise
        })
      );
      // Await each row's write so we never outrun the consumer.
      for (const u of users100) {
        await writeAsync(generateCsvRowFromUser(u));
      }
      maxUserCreationDateGlobal = maxUserCreationDate;
      if (rows100.length < 100) {
        flag = false;
        console.log('***');
      }
    }
  }
  console.log('end');
  stream.end();
};
as a result I see this output in the console:
100 // 100
100 // 200
100 // 300
100 // 400
100 // 500
87 // 587
***
end
But in the downloaded file I get 401 lines (the first one with USER_REPORT_HEADER). It feels like stream.end() closes the stream before all values are read from it.
I tried using BehaviorSubject from rxjs instead of PassThrough in a similar way - the result is the same..
How can I wait for reading from the stream of all the data that I wrote there?
Or maybe someone can recommend an alternative way to solve this problem.
stream.write expects you to pass a callback as a second (or third parameter), to know when the write operation did finish. You can't call write again unless the previous write operation is finished.
So in general I'd suggest to make this whole function async and every time you call stream.write you wrap it into a Promise like
await new Promise((resolve, reject) => stream.write(data, (error) => {
if (error) {
reject(error);
return;
}
resolve();
});
Obviously it would make sense to extract this to some method.
EDIT: Additionally I don't think that's the actual problem. I assume your http connection is just timing out before all the fetching is completed, so the server will eventually close the stream once the timeout deadline is met.

AWS Lambda | Read Multiple Files from S3 | Create combined JSON | Facing performance issue

I am reading multiple JSON files from S3 on AWS lambda and finally creating one JSON after some processing, I am able to read the files and combined them but seems its not optimal way to optimize minimum IO operation, I am facing performance hit when the file size is big.
here my simplified code
exports.handler = function (e, callback) {
helper.data.readJSON(s3_param)
.then(function (data_1) {
var data_1 = JSON.parse(data_1);
helper.data.readJSON(s3_param)
.then(function (data_2) {
var data_2 = JSON.parse(data_2);
helper.data.readJSON(s3_param)
.then(function (data_3) {
var data_3 = JSON.parse(data_3);
return SomeFuntion(data_1, data_2, data_3);
});
});
});
};
this is readJSON code which read file from S3
// Read an S3 object and resolve with its body decoded as UTF-8 text.
async function readJSON(params) {
  const response = await S3.getObject(params).promise();
  const body = response.Body;
  return body.toString('utf-8');
}
can someone please suggest better way to do it.
I guess you could do things in parallel using Promise.all. something like
async function readJSON(params) {
const data = (await (S3.getObject(params).promise())).Body.toString('utf-8');
return JSON.parse(data);
}
exports.handler = async event => {
const [data_1, data_2, data_3] = await Promise.all([
readJSON(s3_param1),
readJSON(s3_param2),
readJSON(s3_param3)
]);
return someFunction(data_1, data_2, data_3);
};
Hope this helps

How to one-time initialize a node module with data from a local file

I am trying to create a node module that has some helper functions for searching through reference data that I have in a CSV file. I've used the csv-parser module for loading it into objects, and this API seems to be for use with an asynchronous stream reader / pipe situation. I don't want the helper functions in this module to be available to any other modules before this reference data has had a chance to load.
I've tried using a Promise, but in order to get it to work, I've had to expose that promise and the initialization function to the calling module(s), which is not ideal.
// refdata.js
// refdata.js
const fs = require('fs');
const csv = require('csv-parser');
// Module-level cache of parsed CSV rows, filled once by initRefdata().
var theData = new Array();
// Streams refdata.csv through csv-parser, pushing each row into theData;
// resolves when the stream ends.
// NOTE(review): no 'error' handler is attached, so read/parse failures
// leave the promise pending forever.
function initRefdata() {
return(new Promise(function(resolve, reject) {
fs.createReadStream('refdata.csv')
.pipe(csv())
.on('data', function(data) {theData.push(data);})
.on('end', resolve);}));
}
// Placeholder: body elided in the question.
function helperFunction1(arg) { ... }
module.exports.initRefdata = initRefdata;
module.exports.helperFunction1 = helperFunction1;
// main.js
var refdata = require('./refdata.js');
function doWork() {
console.log(refdata.helperFunction1('Italy'));
}
// Callers must explicitly await initialization before using the helpers —
// the coupling the author wants to avoid.
refdata.initRefdata().then(doWork);
This works for this one use of the reference data module, but it is frustrating that I cannot use an initialization function completely internally to refdata.js. When I do, the asynchronous call to the stream pipe is not complete before I start using the helper functions, which need all the data before they can be useful. Also, I do not want to re-load all the CSV data each time it is needed.
With the comment from #Deepal I was able to come up with:
// refdata.js
const fs = require('fs');
const csv = require('csv-parser');
var theData = new Array();
function initRefdata() {
return(new Promise(function(resolve, reject) {
fs.createReadStream('refdata.csv')
.pipe(csv())
.on('data', function(data) {theData.push(data);})
.on('end', resolve);}));
}
function helperFunction1(arg) {
if (theData.length == 0) {
initRefdata().then(nestedHelper(arg));
}
else {
nestedHelper(arg);
}
function nestedHelper(arg) { ... }
}
module.exports.helperFunction1 = helperFunction1;
// main.js
var refdata = require('./refdata.js');
function doWork() {
console.log(refdata.helperFunction1('Italy'));
}
doWork();

NodeJS - read CSV file to array returns []

I'm trying to use the promised-csv module (https://www.npmjs.com/package/promised-csv) to read the rows of a CSV file to an array of strings for a unit test:
const inputFile = '.\\test\\example_result.csv';
const CsvReader = require('promised-csv');
// Intended to collect the first column of every row into an array.
// NOTE(review): reader.read() is asynchronous — `output` is returned before
// any 'row' events have fired, which is why the caller sees [].
function readCSV(inputFile){
var reader = new CsvReader();
var output = [];
reader.on('row', function (data) {
//console.log(data);
output.push(data[0]);
});
reader.read(inputFile, output);
return output;
}
I would like to call this function later in a unit test.
// NOTE(review): readCSV returns synchronously before the file is read, so
// resultSet logs as [] here; the function needs to return a promise that is
// awaited (see the answer below the question).
it("Should store the elements of the array", async () => {
var resultSet = readCSV(inputFile);
console.log(resultSet);
});
However, resultSet yields an empty array. I am also open to use any other modules, as long as I can get an array of strings as a result.
The code should look something like this, according to the docs.
const inputFile = './test/example_result.csv';
const CsvReader = require('promised-csv');

// Read every row of the CSV at `inputFile` and resolve with a flat array of
// all row values, in file order. Rejects on any reader error.
function readCSV(inputFile) {
  return new Promise((resolve, reject) => {
    const reader = new CsvReader();
    let collected = [];
    // Each 'row' event delivers an array of values; append them all.
    reader.on('row', (rowValues) => {
      collected = collected.concat(rowValues);
    });
    // 'done' fires after the last row has been emitted.
    reader.on('done', () => resolve(collected));
    reader.on('error', (err) => reject(err));
    reader.read(inputFile);
  });
}

it("Should store the elements of the array", async () => {
  const resultSet = await readCSV(inputFile);
  console.log(resultSet);
});
readCSV() returns a Promise. There are two ways that you can access the data it returns upon completion.
As Roland Starke suggests, use async and await.
var resultSet = await readCSV(inputFile);
This will wait for the Promise to resolve before returning a value.
More here: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/await
Use Promise.prototype.then() - this is similar to async/await, but can also be chained with other promises and Promise.prototype.catch().
The most important thing to remember is that the function passed to .then() will not be executed until readCSV() has resolved.
readCSV().then((data)=>{return data}).catch((err)=>{console.log(err)})
More here: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise/then

Refactor synchronous code to unleash the power of node.js asynchronicity

We long-term Python and PHP coders have a tidy bit of synchronous code (sample below). Most of the functions have asynchronous counterparts. We really want to 'get' the power of Javascript and Node, and believe this is an ideal case of where asynchronous node.js would speed things up and blow our socks off.
What is the textbook way to refactor the following to utilize asynchronous Node? Async / await and promise.all? How? (Using Node 8.4.0. Backwards compatibility is not a concern.)
var fs = require('fs');
var crypto = require('crypto'); // fixed: crypto was used below but never required
// This could list over 10,000 files of various size
const fileList = ['file1', 'file2', 'file3'];
const getCdate = file => fs.statSync(file).ctime; // Has async method
const getFSize = file => fs.statSync(file).size; // Has async method
// Can be async through file streams (see resources below)
// Synchronously compute the hex MD5 digest of a file's contents.
const getMd5 = (file) => {
  // Fixed: dropped the dead `new Buffer(0)` initializer — the Buffer
  // constructor is deprecated and the value was immediately overwritten.
  const fileData = fs.readFileSync(file);
  const hash = crypto.createHash('md5');
  hash.update(fileData);
  return hash.digest('hex');
};
// One [name, metadata] pair per file, preserving fileList order.
let filesObj = fileList.map(file => [file, {
  datetime: getCdate(file),
  filesize: getFSize(file),
  md5hash: getMd5(file),
}]);
console.log(filesObj);
Notes:
We need to keep the functions modular and re-usable.
There are more functions getting things for filesObj than listed here
Most functions can be re-written to be async, some can not.
Ideally we need to keep the original order of fileList.
Ideally we want to use latest Node and JS features -- not rely on external modules.
Assorted file stream methods for getting md5 asynchronously:
Obtaining the hash of a file using the stream capabilities of crypto module (ie: without hash.update and hash.digest)
How to calculate md5 hash of a file using javascript
There are a variety of different ways you could handle this code asynchronously. You could use the node async library to handle all of the callbacks more elegantly. If you don't want to dive into promises then that's the "easy" option. I put easy in quotes because promises are actually easier if you understand them well enough. The async lib is helpful but it still leaves much to be desired in the way of error propagation, and there is a lot of boilerplate code you'll have to wrap all your calls in.
The better way is to use promises. Async/Await is still pretty new. Not even supported in node 7 (not sure about node 8) without a preprocessor like Babel or TypeScript. Also, async/await uses promises under the hood anyway.
Here is how I would do it using promises, even included a file stats cache for maximum performance:
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');

const fileList = ['file1', 'file2', 'file3'];

// Use Bluebird's Promise.promisifyAll utility to turn all of fs'
// async functions into promise returning versions of them.
// The new promise-enabled methods will have the same name but with
// a suffix of "Async". Ex: fs.stat will be fs.statAsync.
Promise.promisifyAll(fs);

// Single-entry cache so multiple stat properties for the same file only
// trigger one fs.stat call.
let cache = {
  fileName: null,
  fileStats: null
};

// Resolve the requested stat property for fileName. On a cache miss this
// returns a promise that also populates the cache; on a hit it returns the
// raw value — both work inside Promise.all, which wraps raw values in
// Promise.resolve.
const getFileStats = (fileName, prop) => {
  if (cache.fileName === fileName) {
    return cache.fileStats[prop];
  }
  return fs.statAsync(fileName).then(fileStats => {
    cache.fileName = fileName;
    cache.fileStats = fileStats;
    return fileStats[prop];
  })
};

// Promise of the hex MD5 digest of the file's contents.
const getMd5Hash = file => {
  return fs.readFileAsync(file).then(fileData => {
    const hash = crypto.createHash('md5');
    hash.update(fileData);
    return hash.digest('hex');
  });
};

// Bluebird's Promise.map works like Array.map but resolves each entry.
// Fixed: the original body referenced getMd5 and `file`, neither of which
// exists in this scope — the function is getMd5Hash and the mapped
// variable is fileName.
let results = Promise.resolve(fileList).map(fileName => {
  return Promise.all([
    getFileStats(fileName, 'ctime'),
    getFileStats(fileName, 'size'),
    getMd5Hash(fileName)
  ])
  // .spread is a bluebird shortcut that replaces .then: it spreads the
  // resolved array (ctime, size, hash — in input order) into individual
  // function parameters.
  .spread((dateTime, fileSize, md5Hash) => [fileName, { dateTime, fileSize, md5Hash }]);
}).catch(error => {
  // Any errors returned by any of the Async functions in this promise
  // chain will be propagated here.
  console.log(error);
});
Here's the code again but without comments to make it easier to look at:
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');
const fileList = ['file1', 'file2', 'file3'];
Promise.promisifyAll(fs);
// Single-entry cache so repeated stat lookups for one file stat it once.
let cache = {
  fileName: null,
  fileStats: null
};
const getFileStats = (fileName, prop) => {
  if (cache.fileName === fileName) {
    return cache.fileStats[prop];
  }
  return fs.statAsync(fileName).then(fileStats => {
    cache.fileName = fileName;
    cache.fileStats = fileStats;
    return fileStats[prop];
  })
};
const getMd5Hash = file => {
  return fs.readFileAsync(file).then(fileData => {
    const hash = crypto.createHash('md5');
    hash.update(fileData);
    return hash.digest('hex');
  });
};
// Fixed: the map callback referenced getMd5 and `file`, neither of which
// exists here; the function is getMd5Hash and the variable is fileName.
let results = Promise.resolve(fileList).map(fileName => {
  return Promise.all([
    getFileStats(fileName, 'ctime'),
    getFileStats(fileName, 'size'),
    getMd5Hash(fileName)
  ]).spread((dateTime, fileSize, md5Hash) => [fileName, { dateTime, fileSize, md5Hash }]);
}).catch(console.log);
In the end, results will be an array like the following, which should hopefully match the results of your original code but should perform much better in a benchmark:
[
['file1', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }],
['file2', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }],
['file3', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }]
]
Apologies in advance for any typos. Didn't have the time or ability to actually run any of this. I looked over it quite extensively though.
After discovering that async/await is in node as of 7.6 I decided to play with it a bit last night. It seems most useful for recursive async tasks that don't need to be done in parallel, or for nested async tasks that you might wish you could write synchronously. For what you needed here there isn't any mind-blowing way to use async/await that I can see but there are a few places where the code would read more cleanly. Here's the code again but with a few little async/await conveniences.
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');
const fileList = ['file1', 'file2', 'file3'];
// Adds promise-returning *Async variants of every fs function.
Promise.promisifyAll(fs);
// Single-entry stats cache so one file is only stat'ed once.
let cache = {
  fileName: null,
  fileStats: null
};
// Async/await flavour of getFileStats.
// Fixed: the original awaited fs.stat — the callback-style API, which does
// not return a promise; the promisified variant is fs.statAsync.
async function getFileStats (fileName, prop) {
  if (cache.fileName === fileName) {
    return cache.fileStats[prop];
  }
  let fileStats = await fs.statAsync(fileName);
  cache.fileName = fileName;
  cache.fileStats = fileStats;
  return fileStats[prop];
};
// Async/await flavour of the MD5 helper.
async function getMd5Hash (file) {
  let fileData = await fs.readFileAsync(file);
  const hash = crypto.createHash('md5');
  hash.update(fileData);
  return hash.digest('hex');
};
// Fixed: getMd5(file) -> getMd5Hash(fileName); `file` was not defined in
// this scope.
let results = Promise.resolve(fileList).map(fileName => {
  return Promise.all([
    getFileStats(fileName, 'ctime'),
    getFileStats(fileName, 'size'),
    getMd5Hash(fileName)
  ]).spread((dateTime, fileSize, md5Hash) => [fileName, { dateTime, fileSize, md5Hash }]);
}).catch(console.log);
I would make getCDate, getFSize, and getMd5 all asynchronous and promisified then wrap them in another asynchronous promise-returning function, here called statFile.
// Gather all three pieces of metadata for one file in parallel and combine
// them into a single object.
// Fixed: Promise.all resolves with an ARRAY; the original handler
// `(datetime, filesize, md5hash) => ...` received that array as its first
// parameter and left the other two undefined. Destructuring fixes it.
function statFile(file) {
  return Promise.all([
    getCDate(file),
    getFSize(file),
    getMd5(file)
  ]).then(([datetime, filesize, md5hash]) => ({datetime, filesize, md5hash}))
    .catch(/*handle error*/);
}
Then you could change your mapping function to
// One pending statFile promise per file; array order matches fileList.
const promises = fileList.map(statFile);
Then it's simple to use Promise.all:
// Resolves with the results in the same order as `promises` once every
// statFile call has finished; a single failure lands in .catch.
Promise.all(promises)
.then(filesObj => /*do something*/)
.catch(err => /*handle error*/)
This leaves things modular, doesn't require async/await, allows you to plug extra functions into statFile, and preserves your file order.

Resources