NodeJS copyFile then unlink in callback - node.js

I have a large amount of files (millions) that I want to copy from a folder to an other one, but then based on some parsing options, I will need to delete the file if it doesn't respect some criteria.
I'm doing the parsing on my local copy of the file because it would be slower on the network to parse (read file + parse) than doing it locally.
My code looks like this :
for (let file of files) {
fs.copyFile(from, to, err => {
if (err) return;
parse(file);
});
}
The parse function is something like :
parse = file => {
fs.readFile(file, (err, data) => {
//do some parsing
if (notOk) {
fs.unlink(file);
};
}
}
The problem is that it's doing all the copyFile and doesn't seem to execute the callback with the unlink, and I really need to unlink as soon as the file is finished copying since I can't afford the disk space to copy all files first.
Do I need to use the Sync version of those methods or something else?
Thanks
I would have expected an output like this :
copyFile a
copyFile b
copyFile c
parsing a
copyFile d
unlink a
parsing b
copyFile e
...
but instead I have all copyFile and none of the parsing/unlink happening.

If files contains millions of files, you're starting millions of concurrent copy actions, which is probably overwhelming your system.
Instead, consider using something like async.eachLimit to limit the number of concurrent copy actions.
Untested example:
const async = require('async');
async.eachLimit(files, 10, (file, callback) => {
fs.copyFile(from, to, err => {
if (err) return callback(); // or pass `err`
parse(file, callback);
});
});
function parse(file, callback) {
fs.readFile(file, (err, data) => {
//do some parsing
if (notOk) {
fs.unlink(file, callback);
} else {
return callback();
}
})
}

I think this is happening, the callback in the coping is finishing so the thread is finished, but the functions can not finish, you can make this sync and is going to work fine and be more clear, but other way is to transform the functions to promises so you can return the promise and wait for them.
You could write something like this.
const util = require('util');
const fs = require('fs');
const files = ['1.txt', '2.txt', '3.txt', '4.txt'];
//promisify all the fs functions to use.
const copyFilePromise = util.promisify(fs.copyFile);
const readFilePromise = util.promisify(fs.readFile);
const unlinkPromise = util.promisify(fs.unlink);
//make a parser function that return a promise.
const parse = (file) => {
return readFilePromise(file)
.then(() => {
//do some parsing
if(notOk)
return unlinkPromise(file)
});
};
//store all the promises in an array
let promises = []
//iterate for all the files using the promisify aproach of the functions
for (let file of files){
//add all the promises to an array of promises.
promises.push(copyFilePromise(from, to).then(parse(file)).catch(console.error));
}
//wait for all the promises to resolve.
Promise.all(promises);

Related

Is fs_promises_readdir an async function?

Basically, what I'am trying to this is to read all the files names from a directory and I found this fsPromises.readdir which does pretty much what I want :
Returns with an array of the names of the files in the directory.
I have taken this code from nodeJS documentation page , here :
https://nodejs.org/api/fs.html#fs_fspromises_readdir_path_options
import { readdir } from 'fs/promises'; // I changed this to : const readdir = require('fs/promises');
try {
const files = await readdir(path);
for await (const file of files)
console.log(file);
} catch (err) {
console.error(err);
}
When I run this , the console will give me the following error :
const files = await readdir(path);
^^^^^
SyntaxError: await is only valid in async function
This is weird since according to the documentation this function is a Promise and if it is a promise that means it is async. Right ?
So , I dont know what am I missing ?
The error you're seeing refers to the fact that you're using the await keyword outside of an async function.
I think this is allowed in recent versions of Node, but if you're running an older version you will need to wrap your code in an async function.
(async function() {
try {
const files = await readdir(path);
for await (const file of files)
console.log(file);
} catch (err) {
console.error(err);
})()
}()

How to make fs.readFile async await?

I have this nodejs code here which read a folder and process the file. The code works. But it is still printing all the file name first, then only read the file. How do I get a file and then read a content of the file first and not getting all the files first?
async function readingDirectory(directory) {
try {
fileNames = await fs.readdir(directory);
fileNames.map(file => {
const absolutePath = path.resolve(folder, file);
log(absolutePath);
fs.readFile(absolutePath, (err, data) => {
log(data); // How to make it async await here?
});
});
} catch {
console.log('Directory Reading Error');
}
}
readingDirectory(folder);
To use await, you need to use promise versions of fs.readFile() and fs.readdir() which you can get on fs.promises and if you want these to run sequentially, then use a for loop instead of .map():
async function readingDirectory(directory) {
const fileNames = await fs.promises.readdir(directory);
for (let file of fileNames) {
const absolutePath = path.join(directory, file);
log(absolutePath);
const data = await fs.promises.readFile(absolutePath);
log(data);
}
}
readingDirectory(folder).then(() => {
log("all done");
}).catch(err => {
log(err);
});

NodeJS return not stopping the entire function

I have the function, enter, that contains two fs.readFile functions inside and on the innermost readfile I'm checking to see whether a txt file contains a certain keyword, and if it does it should stop the entire enter function.
Her is what the enter function looks like:
async function enter(email, firstName, lastName){
fs.readFile(fileName, function(err, data){
parsedData = JSON.parse(data);
email = parsedData.email;
fs.readFile('./anotherfile.txt', function (err, data) {
if (err) throw err;
if(data.includes(email)){
console.log('Stopping function');
return;
}
});
});
console.log('Continuing with function');
}
The problem is that when anotherfile.txt contains the keyword, it does not stop the entire function, it continues and logs, "Continuing with function" as seen in the code above.
Any help would be appreciated!
fs promises are available Node v11.0.0
or You can can convert like this const readFile = util.promisify(fs.readFile);
const fsp = require('fs').promises;
async function enter(email, firstName, lastName) {
try {
let data = await fsp.readFile(fileName)
let parsedData = JSON.parse(data);
let email = parsedData.email;
data = await fsp.readFile('./anotherfile.txt')
if (data.includes(email)) {
console.log('Stopping function');
return;
}
console.log('Continuing with function');
} catch (err) {
throw err
}
}
This is because of two things.
You are using asynchronous File read, that is, the flow of code won't stop when this readFile is called. Instead, the program would keep executing in a normal fashion. And when the file read operation is completed, the callback function you supplied will be called with the corresponding error or data.
The return statement is inside of the callback function, hence it will only affect that function.
You need to use await when you are dealing with asynchronous functions. Look that up here

Refactor synchronous code to unleash the power of node.js asynchronicity

We long-term Python and PHP coders have a tidy bit of synchronous code (sample below). Most of the functions have asynchronous counterparts. We really want to 'get' the power of Javascript and Node, and believe this is an ideal case of where asynchronous node.js would speed things up and blow our socks off.
What is the textbook way to refactor the following to utilize asynchronous Node? Async / await and promise.all? How? (Using Node 8.4.0. Backwards compatibility is not a concern.)
var fs = require('fs');
// This could list over 10,000 files of various size
const fileList = ['file1', 'file2', 'file3'];
const getCdate = file => fs.statSync(file).ctime; // Has async method
const getFSize = file => fs.statSync(file).size; // Has async method
// Can be async through file streams (see resources below)
const getMd5 = (file) => {
let fileData = new Buffer(0);
fileData = fs.readFileSync(file);
const hash = crypto.createHash('md5');
hash.update(fileData);
return hash.digest('hex');
};
let filesObj = fileList.map(file => [file, {
datetime: getCdate(file),
filesize: getFSize(file),
md5hash: getMd5(file),
}]);
console.log(filesObj);
Notes:
We need to keep the functions modular and re-usable.
There are more functions getting things for filesObj than listed here
Most functions can be re-written to be async, some can not.
Ideally we need to keep the original order of fileList.
Ideally we want to use latest Node and JS features -- not rely on external modules.
Assorted file stream methods for getting md5 asynchronously:
Obtaining the hash of a file using the stream capabilities of crypto module (ie: without hash.update and hash.digest)
How to calculate md5 hash of a file using javascript
There are a variety of different ways you could handle this code asynchronously. You could use the node async library to handle all of the callbacks more elegantly. If you don't want to dive into promises then that's the "easy" option. I put easy in quotes because promises are actually easier if you understand them well enough. The async lib is helpful but it still leaves much to be desired in the way of error propagation, and there is a lot of boilerplate code you'll have to wrap all your calls in.
The better way is to use promises. Async/Await is still pretty new. Not even supported in node 7 (not sure about node 8) without a preprocessor like Bable or Typescript. Also, async/await uses promises under the hood anyway.
Here is how I would do it using promises, even included a file stats cache for maximum performance:
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');
const fileList = ['file1', 'file2', 'file3'];
// Use Bluebird's Promise.promisifyAll utility to turn all of fs'
// async functions into promise returning versions of them.
// The new promise-enabled methods will have the same name but with
// a suffix of "Async". Ex: fs.stat will be fs.statAsync.
Promise.promisifyAll(fs);
// Create a cache to store the file if we're planning to get multiple
// stats from it.
let cache = {
fileName: null,
fileStats: null
};
const getFileStats = (fileName, prop) => {
if (cache.fileName === fileName) {
return cache.fileStats[prop];
}
// Return a promise that eventually resolves to the data we're after
// but also stores fileStats in our cache for future calls.
return fs.statAsync(fileName).then(fileStats => {
cache.fileName = fileName;
cache.fileStats = fileStats;
return fileStats[prop];
})
};
const getMd5Hash = file => {
// Return a promise that eventually resolves to the hash we're after.
return fs.readFileAsync(file).then(fileData => {
const hash = crypto.createHash('md5');
hash.update(fileData);
return hash.digest('hex');
});
};
// Create a promise that immediately resolves with our fileList array.
// Use Bluebird's Promise.map utility. Works very similar to Array.map
// except it expects all array entries to be promises that will
// eventually be resolved to the data we want.
let results = Promise.resolve(fileList).map(fileName => {
return Promise.all([
// This first gets a promise that starts resolving file stats
// asynchronously. When the promise resolves it will store file
// stats in a cache and then return the stats value we're after.
// Note that the final return is not a promise, but returning raw
// values from promise handlers implicitly does
// Promise.resolve(rawValue)
getFileStats(fileName, 'ctime'),
// This one will not return a promise. It will see cached file
// stats for our file and return the stats value from the cache
// instead. Since it's being returned into a Promise.all, it will
// be implicitly wrapped in Promise.resolve(rawValue) to fit the
// promise paradigm.
getFileStats(fileName, 'size'),
// First returns a promise that begins resolving the file data for
// our file. A promise handler in the function will then perform
// the operations we need to do on the file data in order to get
// the hash. The raw hash value is returned in the end and
// implicitly wrapped in Promise.resolve as well.
getMd5(file)
])
// .spread is a bluebird shortcut that replaces .then. If the value
// being resolved is an array (which it is because Promise.all will
// resolve an array containing the results in the same order as we
// listed the calls in the input array) then .spread will spread the
// values in that array out and pass them in as individual function
// parameters.
.spread((dateTime, fileSize, md5Hash) => [file, { dateTime, fileSize, md5Hash }]);
}).catch(error => {
// Any errors returned by any of the Async functions in this promise
// chain will be propagated here.
console.log(error);
});
Here's the code again but without comments to make it easier to look at:
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');
const fileList = ['file1', 'file2', 'file3'];
Promise.promisifyAll(fs);
let cache = {
fileName: null,
fileStats: null
};
const getFileStats = (fileName, prop) => {
if (cache.fileName === fileName) {
return cache.fileStats[prop];
}
return fs.statAsync(fileName).then(fileStats => {
cache.fileName = fileName;
cache.fileStats = fileStats;
return fileStats[prop];
})
};
const getMd5Hash = file => {
return fs.readFileAsync(file).then(fileData => {
const hash = crypto.createHash('md5');
hash.update(fileData);
return hash.digest('hex');
});
};
let results = Promise.resolve(fileList).map(fileName => {
return Promise.all([
getFileStats(fileName, 'ctime'),
getFileStats(fileName, 'size'),
getMd5(file)
]).spread((dateTime, fileSize, md5Hash) => [file, { dateTime, fileSize, md5Hash }]);
}).catch(console.log);
In the end results will be an array like which should hopefully match the results of your original code but should perform much better in a benchmark:
[
['file1', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }],
['file2', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }],
['file3', { dateTime: 'data here', fileSize: 'data here', md5Hash: 'data here' }]
]
Apologies in advance for any typos. Didn't have the time or ability to actually run any of this. I looked over it quite extensively though.
After discovering that async/await is in node as of 7.6 I decided to play with it a bit last night. It seems most useful for recursive async tasks that don't need to be done in parallel, or for nested async tasks that you might wish you could write synchronously. For what you needed here there isn't any mind-blowing way to use async/await that I can see but there are a few places where the code would read more cleanly. Here's the code again but with a few little async/await conveniences.
const fs = require('fs');
const crypto = require('crypto');
const Promise = require('bluebird');
const fileList = ['file1', 'file2', 'file3'];
Promise.promisifyAll(fs);
let cache = {
fileName: null,
fileStats: null
};
async function getFileStats (fileName, prop) {
if (cache.fileName === fileName) {
return cache.fileStats[prop];
}
let fileStats = await fs.stat(fileName);
cache.fileName = fileName;
cache.fileStats = fileStats;
return fileStats[prop];
};
async function getMd5Hash (file) {
let fileData = await fs.readFileAsync(file);
const hash = crypto.createHash('md5');
hash.update(fileData);
return hash.digest('hex');
};
let results = Promise.resolve(fileList).map(fileName => {
return Promise.all([
getFileStats(fileName, 'ctime'),
getFileStats(fileName, 'size'),
getMd5(file)
]).spread((dateTime, fileSize, md5Hash) => [file, { dateTime, fileSize, md5Hash }]);
}).catch(console.log);
I would make getCDate, getFSize, and getMd5 all asynchronous and promisified then wrap them in another asynchronous promise-returning function, here called statFile.
function statFile(file) {
return Promise.all([
getCDate(file),
getFSize(file),
getMd5(file)
]).then((datetime, filesize, md5hash) => ({datetime, filesize, md5hash}))
.catch(/*handle error*/);
}
Then you could change your mapping function to
const promises = fileList.map(statFile);
Then it's simple to use Promise.all:
Promise.all(promises)
.then(filesObj => /*do something*/)
.catch(err => /*handle error*/)
This leaves things modular, doesn't require async/await, allows you to plug extra functions into statFile, and preserves your file order.

NodeJS, promises, streams - processing large CSV files

I need to build a function for processing large CSV files for use in a bluebird.map() call. Given the potential sizes of the file, I'd like to use streaming.
This function should accept a stream (a CSV file) and a function (that processes the chunks from the stream) and return a promise when the file is read to end (resolved) or errors (rejected).
So, I start with:
'use strict';
var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});
api.parsers.processCsvStream = function(passedStream, processor) {
var parser = csv.parse(passedStream, {trim: true});
passedStream.pipe(parser);
// use readable or data event?
parser.on('readable', function() {
// call processor, which may be async
// how do I throttle the amount of promises generated
});
var db = pgp(api.config.mailroom.fileMakerDbConfig);
return new Promise(function(resolve, reject) {
parser.on('end', resolve);
parser.on('error', reject);
});
}
Now, I have two inter-related issues:
I need to throttle the actual amount of data being processed, so as to not create memory pressures.
The function passed as the processor param is going to often be async, such as saving the contents of the file to the db via a library that is promise-based (right now: pg-promise). As such, it will create a promise in memory and move on, repeatedly.
The pg-promise library has functions to manage this, like page(), but I'm not able to wrap my ahead around how to mix stream event handlers with these promise methods. Right now, I return a promise in the handler for readable section after each read(), which means I create a huge amount of promised database operations and eventually fault out because I hit a process memory limit.
Does anyone have a working example of this that I can use as a jumping point?
UPDATE: Probably more than one way to skin the cat, but this works:
'use strict';
var _ = require('lodash');
var promise = require('bluebird');
var csv = require('csv');
var stream = require('stream');
var pgp = require('pg-promise')({promiseLib: promise});
api.parsers.processCsvStream = function(passedStream, processor) {
// some checks trimmed out for example
var db = pgp(api.config.mailroom.fileMakerDbConfig);
var parser = csv.parse(passedStream, {trim: true});
passedStream.pipe(parser);
var readDataFromStream = function(index, data, delay) {
var records = [];
var record;
do {
record = parser.read();
if(record != null)
records.push(record);
} while(record != null && (records.length < api.config.mailroom.fileParserConcurrency))
parser.pause();
if(records.length)
return records;
};
var processData = function(index, data, delay) {
console.log('processData(' + index + ') > data: ', data);
parser.resume();
};
parser.on('readable', function() {
db.task(function(tsk) {
this.page(readDataFromStream, processData);
});
});
return new Promise(function(resolve, reject) {
parser.on('end', resolve);
parser.on('error', reject);
});
}
Anyone sees a potential problem with this approach?
You might want to look at promise-streams
var ps = require('promise-streams');
passedStream
.pipe(csv.parse({trim: true}))
.pipe(ps.map({concurrent: 4}, row => processRowDataWhichMightBeAsyncAndReturnPromise(row)))
.wait().then(_ => {
console.log("All done!");
});
Works with backpressure and everything.
Find below a complete application that correctly executes the same kind of task as you want: It reads a file as a stream, parses it as a CSV and inserts each row into the database.
const fs = require('fs');
const promise = require('bluebird');
const csv = require('csv-parse');
const pgp = require('pg-promise')({promiseLib: promise});
const cn = "postgres://postgres:password#localhost:5432/test_db";
const rs = fs.createReadStream('primes.csv');
const db = pgp(cn);
function receiver(_, data) {
function source(index) {
if (index < data.length) {
// here we insert just the first column value that contains a prime number;
return this.none('insert into primes values($1)', data[index][0]);
}
}
return this.sequence(source);
}
db.task(t => {
return pgp.spex.stream.read.call(t, rs.pipe(csv()), receiver);
})
.then(data => {
console.log('DATA:', data);
}
.catch(error => {
console.log('ERROR:', error);
});
Note that the only thing I changed: using library csv-parse instead of csv, as a better alternative.
Added use of method stream.read from the spex library, which properly serves a Readable stream for use with promises.
I found a slightly better way of doing the same thing; with more control. This is a minimal skeleton with precise parallelism control. With parallel value as one all records are processed in sequence without having the entire file in memory, we can increase parallel value for faster processing.
const csv = require('csv');
const csvParser = require('csv-parser')
const fs = require('fs');
const readStream = fs.createReadStream('IN');
const writeStream = fs.createWriteStream('OUT');
const transform = csv.transform({ parallel: 1 }, (record, done) => {
asyncTask(...) // return Promise
.then(result => {
// ... do something when success
return done(null, record);
}, (err) => {
// ... do something when error
return done(null, record);
})
}
);
readStream
.pipe(csvParser())
.pipe(transform)
.pipe(csv.stringify())
.pipe(writeStream);
This allows doing an async task for each record.
To return a promise instead we can return with an empty promise, and complete it when stream finishes.
.on('end',function() {
//do something wiht csvData
console.log(csvData);
});
So to say you don't want streaming but some kind of data chunks? ;-)
Do you know https://github.com/substack/stream-handbook?
I think the simplest approach without changing your architecture would be some kind of promise pool. e.g. https://github.com/timdp/es6-promise-pool

Resources