Meteor / Node.js parsing large number of files gets very slow

I have about 1000 CSV files that need parsing. Each one contains about 1000 rows, for 1 million total records. The data need to be transformed and then saved to the db, which is why I have to do this through my app.
My problem is that the parser gradually slows down as it loops through the files, to the point where it will take forever to complete the run.
Here's how it's currently set up.
var files = [ file1Path, file2Path, ..., file1000Path ];

function parseFile(index) {
    var startTime = new Date().getTime();
    var filePath = files[index];
    var stream = fs.createReadStream(filePath);

    // parse using the fast-csv npm module
    csv.fromStream(stream, { config })
        .on('data', function (row) {
            transformAndSave(row);
        })
        .on('end', function () {
            console.log(new Date().getTime() - startTime + " elapsed ");
            parseFile(index + 1);
        });
}

parseFile(0);
I've tried this a few different ways and it's basically the same thing every time: the first file completes in 2 seconds, by the 8th file we're at 5 or 6 seconds, and later on it climbs to 24 seconds, etc. Other things I've tried include files.forEach(function (file) { /* run the parser */ }) and doing batches of 100 at a time or even 5 at a time, and it makes no difference: it progressively slows down from a rate of 500 per second to 1 or 2 per second.
Does anybody have ideas for how I can prevent this slow down? Part of the reason could be that stream.on('end') completes before transformAndSave is finished, potentially creating a backlog. But at this point I'm out of ideas and would appreciate any help anyone could offer.
Thanks so much in advance!!
Daniel
Note for Meteor people: I'm calling this function as a Meteor method. Not sure if that makes any difference, but in case it does, now you know.
Update
Here's the log output demonstrating the steady rise in memory usage and processing time.

Seems like a resource problem, as in you're running out of memory. I would try an approach that doesn't use a recursive function which might allow resources to be released more readily. One approach could be to use async.
var Logger = require('arsenic-logger');
var fs = require('fs');
var async = require('async');
var csv = require('fast-csv');
var path = require('path');

Logger.echoMemoryUsage();

var testDir = path.resolve(__dirname, 'test');

fs.readdir(testDir, (err, files) => {
    Logger.debug(files);

    if (err) {
        Logger.error(err);
    }

    async.mapLimit(files, 2, function(file, cb) {
        var startTime = new Date().getTime();
        var stream = fs.createReadStream(testDir + '/' + file);
        var config = {};

        Logger.debug("Reading: " + file);

        // parse using the fast-csv npm module
        csv.fromStream(stream, config)
            .on('data', function(row) {
                //Logger.debug(row);
                //transformAndSave(row);
            })
            .on('error', function(err) {
                Logger.error(err);
                cb(err);
            })
            .on('end', function() {
                Logger.debug(new Date().getTime() - startTime + " elapsed ");
                setTimeout(cb, 1000);
            });
    }, function(err, results) {
        Logger.info("Finished!");
        process.exit(1);
    });
});
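On the backlog suspicion in the question (stream.on('end') firing before transformAndSave has finished): one way to rule that out is to count pending saves and only move on to the next file once they have all completed. A rough, untested sketch, assuming transformAndSave can take a completion callback (the original code doesn't show one) and with placeholder parser options:

function parseFileAndWait(filePath, done) {
    var pending = 0;
    var ended = false;
    var stream = fs.createReadStream(filePath);

    csv.fromStream(stream, { headers: true }) // placeholder options
        .on('data', function (row) {
            pending++;
            transformAndSave(row, function () {
                pending--;
                if (ended && pending === 0) done(); // every row of this file has been saved
            });
        })
        .on('end', function () {
            ended = true;
            if (pending === 0) done();
        });
}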

Related

Read newest csv records with Node

I'd like to watch a CSV file and get the newest records since it was changed. I'm running the following shell command to build a very simple csv file and append a new line every second:
rm test.csv & x=0 && while true; do echo "${x},${x},${x}" >> test.csv; x=$(($x+1)); sleep 1; done
The following code prints all the records of the file until the first change and then just emits the dashed line, as if it's not re-reading the file:
'use strict';

var fs = require('fs'),
    dataFile = __dirname + '/server/data/test.csv',
    csv = require('csv');

var parser = csv.parse({delimiter: ','}, function(err, data){
    console.log(data);
});

var watcher = fs.watch(dataFile);
watcher.on('change', fileChange);

function fileChange(e, fn){
    if (e) console.error(e);
    fs.createReadStream(dataFile).pipe(parser);
    console.log('-------');
}
Shouldn't the fileChange function re-read the file on every change? My ultimate plan here is to get both the previous array of lines and the current one and use lodash's difference function to return only the differences. If there's a better way, I'm open to hearing it, though.
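(For reference, the lodash part of that plan is a one-liner; a minimal sketch, assuming each snapshot of the file has been split into an array of line strings:)

var _ = require('lodash');

var previousLines = ['0,0,0', '1,1,1'];
var currentLines  = ['0,0,0', '1,1,1', '2,2,2'];

console.log(_.difference(currentLines, previousLines)); // ['2,2,2']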
My guess is that fs.createReadStream() has opened the file and it's not being closed. So on the second event fs.createReadStream() fails. No bueno.
Try using fs.readFile() instead like this:
function fileChange(e, fn){
    if (e) console.error(e);
    fs.readFile(dataFile, function (err, data) {
        if (err) throw err;
        console.log(data);
        console.log('-------');
    });
}
See the documentation here: http://nodejs.org/api/fs.html#fs_fs_readfile_filename_options_callback
I ended up solving the issue by stat-ing the file on each change and streaming only the bytes added since the last read, using the previous size as the stream offset:
'use strict';

var fs = require('fs'),
    dataFile = __dirname + '/server/data/test.csv',
    readSize = 0,
    csv = require('csv');

var parser = csv.parse();

parser.on('readable', function(){
    var record;
    while (record = parser.read()) {
        console.log(record);
    }
});

var watcher = fs.watch(dataFile);
watcher.on('change', fileChange);

// fires when the watched file changes
function fileChange(e, fn){
    // get the stats synchronously
    var stats = fs.statSync(dataFile);

    // if the file hasn't grown yet, wait half a second and try again
    if (stats.size <= readSize) {
        setTimeout(fileChange, 500);
        return;
    }

    // read the stream from the previous offset to the new end of file
    var stream = fs.createReadStream(dataFile, {start: readSize, end: stats.size});
    stream.on('data', function(chunk){
        parser.write(chunk.toString());
    });

    readSize = stats.size;
}
Any feedback on why this may not work would be appreciated.

Spawning a mongoinsert

My goal is to insert VERY large CSVs, so right now I use CSV streaming like so:
var myCollection = db.collection(myCollectionId);
var q = async.queue(Collection.insert.bind(myCollection), 10);

csv()
    .from.path(myFilePath, {columns: true})
    .transform(function(data, index, cb){
        q.push(data, function (err, res) {
            if (err) return cb(err);
            cb(null, res[0]);
        });
    })
    .on('end', function () {
        q.drain = function() {
            //do some stuff
        };
    })
    .on('error', function (err) {
        res.end(500, err.message);
        console.log('on.error() executed');
    });
But when the files get REALLY large, like 70M+, and it's streaming them, my server becomes very slow and it takes forever, and when I try to load pages on the website they're lethargic during this process.
Why is it not possible to execute a mongo insert using a cron job like this? I ask because the same insert takes maybe 30 seconds from the mongo command line.
P.S. Don't mind the readFile and lines part, I am doing that because I want to test for when all the lines have been inserted into the collection after the process is started (haven't implemented this yet).
var cronJob = require('cron').CronJob;
var spawn = require('child_process').spawn;
var fs = require('fs');

function MongoImportEdgeFile(dataID, filePath){
    var scriptPath = "/local/main/db/mongodb-linux-x86_64-2.4.5/bin/mongoimport";
    console.log("script path = " + scriptPath);
    var output = "";

    fs.readFile(filePath, 'utf-8', function(err, data) {
        if (err){
            console.log(err);
            throw err;
        }

        //console.log('data = ' + data);
        var lines = data.split('\n');
        console.log("total lines in file = " + lines.length);

        var job = new cronJob(new Date(), function() {
            // store a reference to 'this', which is the cronJob object;
            // needed to stop the job after the import is done executing
            var context = this;

            // execute mongoimport asynchronously
            var script = spawn(scriptPath, [" -d mydb -c Data_ForID_" + dataID + " --file " + filePath + " --type csv" ]);
            console.log("Executing mongoimport via node-cron: " + scriptPath);

            // mongoimport has finished executing, so complete the cron job and fire the completion callback
            script.on('close', function() {
                console.log('inside script.on(close, function() for import');
                context.stop();
            });
        }, function() {
            // callback function that executes upon completion
            console.log("Finished executing import");
        }, true);
    });
}
You shouldn't use individual insert calls. You're forcing mongo to perform internal sync with each call -- I think it's even worse given your parallel approach.
Use bulk insertion: it's as easy as calling insert() with an array.
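As a rough, untested sketch of that idea against the code above (the batch size is an arbitrary example value, and it assumes the same myCollection handle):

var batch = [];
var BATCH_SIZE = 1000; // arbitrary example value

function flushBatch(done) {
    if (batch.length === 0) return done();
    var docs = batch;
    batch = [];
    myCollection.insert(docs, done); // one insert call for the whole array
}

// inside the csv .transform() step, instead of q.push(data, ...):
//     batch.push(data);
//     if (batch.length >= BATCH_SIZE) flushBatch(cb); else cb(null, data);
// and call flushBatch() one final time in the 'end' handler.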
You could execute mongoimport directly from node by creating a child process. Here's an article on using mongoimport to import a csv. You can also do json.
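A minimal sketch of that (untested; the database name and flags are example values, and it assumes mongoimport is on the PATH). Note that, unlike the code in the question, each argument must be its own array element:

var spawn = require('child_process').spawn;

function runMongoImport(filePath, collectionName, done) {
    var args = ['-d', 'mydb', '-c', collectionName, '--type', 'csv', '--headerline', '--file', filePath];
    var child = spawn('mongoimport', args);

    child.stderr.on('data', function (chunk) { console.error(chunk.toString()); });
    child.on('close', function (code) {
        done(code === 0 ? null : new Error('mongoimport exited with code ' + code));
    });
}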
Somehow I missed the part about using mongoimport inside cron. If I understand correctly it looks like you somehow know the csv's you would like to import, and you are using cron to check for them.
Have you considered a message queue? This will allow your processor to receive the import job instantaneously instead of on an interval. This will also throttle your processing.
If you need more throughput, you could create additional listener processes that are attached to the same queue. They will compete for the next job. This will allow your solution to scale.
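For example, a minimal consumer sketch using the amqplib package (assuming a local RabbitMQ instance; the queue name, message shape, and the runImport helper are made up for illustration):

var amqp = require('amqplib');

amqp.connect('amqp://localhost')
    .then(function (conn) { return conn.createChannel(); })
    .then(function (ch) {
        return ch.assertQueue('csv-imports').then(function () {
            ch.consume('csv-imports', function (msg) {
                if (msg === null) return;
                var job = JSON.parse(msg.content.toString()); // e.g. { filePath: '/data/foo.csv', dataID: 42 }

                // kick off the import (mongoimport child process, bulk insert, etc.) here,
                // then acknowledge the message so the broker can hand out the next job
                runImport(job, function (err) {
                    if (err) return ch.nack(msg);
                    ch.ack(msg);
                });
            });
        });
    });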

Node.js app continuously consumes memory without freeing it

I'm facing a weird situation. I wrote an application that performs an HTTP GET request every five minutes. Something like this:
// pingy
'use strict';

var https = require('https'),
    url = require('url');

exports.ping = function ping (callback) {
    var options = url.parse('https://host.tld/ping');

    callback = callback || function () {};

    https.get(options, function handler (response) {
        var body = '';

        response
            .on('readable', function onReadable() {
                body = body + response.read();
            })
            .on('end', function onEnd() {
                return callback(null, body);
            })
            .on('error', function onError (err) {
                return callback(err);
            });
    });
};

// in other module
var pingy = require('./lib/pingy');

setInterval(pingy.ping, 300000);
Pretty straightforward. The thing is that after some time, the "rss" value from process.memoryUsage() climbs and climbs. It looks like the created ClientRequest objects are never GCed. Although I'm using https here in this example, the same happens when using the http module.
Do you have any idea what is wrong here?
EDIT:
I've solved the problem above (see my comment below). Now I'm facing a different problem, which is really, really hard to track down (I used node-webkit-agent to analyze the memory usage, but nothing looks special; the heap looks stable to me). The scenario is also nothing special: I'm copying roughly 200 images from source to dest via streams (see the code below). What happens is that the RSS increases as well. I'm pretty sure that there is something wrong with my code regarding how I pipe the files. Don't get me wrong, I have no problem with high memory usage; what I do have a problem with is that the memory is never freed. In order to verify that the memory will be cleared at some point in the future, I start an http server after every single file has been copied. Even after a couple of hours the "rss" value stays the same.
So, well, again, do you have any idea what is wrong here? Thanks in advance for every hint! :)
'use strict';

var http = require('http'),
    fs = require('fs'),
    path = require('path'),
    util = require('util'),
    directory = '/path/to/your/files/',
    jobs = [];

function printMemoryUsage () {
    var memory = process.memoryUsage();
    memory.rss = (memory.rss / 1024) | 0;
    memory.heapTotal = (memory.heapTotal / 1024) | 0;
    memory.heapUsed = (memory.heapUsed / 1024) | 0;
    console.log(JSON.stringify(memory));
}

function pipeFile() {
    var infile = jobs.pop(),
        outfile = jobs.pop(),
        instream = fs.createReadStream(infile),
        outstream = fs.createWriteStream(outfile);

    instream.pipe(outstream);
    instream.on('close', outstream.end.bind(outstream));

    outstream.on('finish', function onFinish () {
        // console.log('Finished %s -> %s', infile, outfile);
        instream.destroy();
        outstream.destroy();
        next();
    });
}

function next() {
    if (jobs.length) {
        setTimeout(pipeFile, 2000);
    } else {
        http.createServer(function (req, res) {
            res.writeHead(200, {'Content-Type': 'text/plain'});
            res.end('Fooboot\n');
        }).listen(1337, '127.0.0.1');
    }
}

fs.readdir(directory, function (err, files) {
    files.forEach(function onIteration (file) {
        jobs.push(path.join(__dirname, file)); // outfile
        jobs.push(path.join(directory, file)); // infile
    });
    next();
});

setInterval(printMemoryUsage, 3000);
These are the memory footprints:
{"rss":13904,"heapTotal":6963,"heapUsed":1758}
{"rss":16012,"heapTotal":6963,"heapUsed":2016}
{"rss":26040,"heapTotal":6963,"heapUsed":2265}
{"rss":31468,"heapTotal":6963,"heapUsed":2453}
{"rss":41080,"heapTotal":6963,"heapUsed":2712}
{"rss":46620,"heapTotal":6963,"heapUsed":2844}
{"rss":49260,"heapTotal":6963,"heapUsed":1999}
{"rss":49524,"heapTotal":6963,"heapUsed":2249}
{"rss":49524,"heapTotal":6963,"heapUsed":2362}
{"rss":49788,"heapTotal":6963,"heapUsed":2621}
{"rss":49788,"heapTotal":6963,"heapUsed":2755}
{"rss":52692,"heapTotal":6963,"heapUsed":2001}
{"rss":52692,"heapTotal":6963,"heapUsed":2138}
{"rss":52692,"heapTotal":6963,"heapUsed":2270}
{"rss":52692,"heapTotal":6963,"heapUsed":2483}
{"rss":52692,"heapTotal":6963,"heapUsed":2600}
{"rss":52692,"heapTotal":6963,"heapUsed":2796}
{"rss":52956,"heapTotal":6963,"heapUsed":1951}
{"rss":52956,"heapTotal":6963,"heapUsed":2079}
{"rss":52956,"heapTotal":6963,"heapUsed":2343}
{"rss":52956,"heapTotal":6963,"heapUsed":2462}
{"rss":52956,"heapTotal":6963,"heapUsed":2689}
{"rss":52956,"heapTotal":6963,"heapUsed":2831}
{"rss":53136,"heapTotal":9011,"heapUsed":1927}
{"rss":53136,"heapTotal":9011,"heapUsed":2176}
{"rss":53136,"heapTotal":9011,"heapUsed":2273}
{"rss":53136,"heapTotal":9011,"heapUsed":2447}
{"rss":53136,"heapTotal":9011,"heapUsed":2545}
{"rss":53136,"heapTotal":9011,"heapUsed":2627}
{"rss":53136,"heapTotal":9011,"heapUsed":2804}
{"rss":53136,"heapTotal":9011,"heapUsed":2890}
{"rss":59732,"heapTotal":9011,"heapUsed":3100}
{"rss":65012,"heapTotal":9011,"heapUsed":3211}
{"rss":73496,"heapTotal":9011,"heapUsed":3409}
{"rss":79304,"heapTotal":9011,"heapUsed":3536}
{"rss":83792,"heapTotal":9011,"heapUsed":3633}
{"rss":95408,"heapTotal":9011,"heapUsed":3865}
{"rss":98840,"heapTotal":9011,"heapUsed":1824}
{"rss":98840,"heapTotal":9011,"heapUsed":2003}
{"rss":98840,"heapTotal":9011,"heapUsed":2205}
{"rss":98840,"heapTotal":9011,"heapUsed":2297}
{"rss":98840,"heapTotal":9011,"heapUsed":2491}
{"rss":98840,"heapTotal":9011,"heapUsed":2608}
{"rss":98840,"heapTotal":9011,"heapUsed":2717}
{"rss":98840,"heapTotal":9011,"heapUsed":2919}
{"rss":99368,"heapTotal":9011,"heapUsed":3036}
{"rss":99368,"heapTotal":9011,"heapUsed":3247}
{"rss":99632,"heapTotal":9011,"heapUsed":3351}
{"rss":99632,"heapTotal":9011,"heapUsed":3452}
{"rss":99896,"heapTotal":9011,"heapUsed":3606}
{"rss":99896,"heapTotal":9011,"heapUsed":3686}
{"rss":105968,"heapTotal":9011,"heapUsed":3824}
{"rss":106760,"heapTotal":9011,"heapUsed":1936}
{"rss":106760,"heapTotal":9011,"heapUsed":2022}
{"rss":106760,"heapTotal":9011,"heapUsed":2187}
{"rss":106760,"heapTotal":9011,"heapUsed":2279}
{"rss":106760,"heapTotal":9011,"heapUsed":2474}
{"rss":106760,"heapTotal":9011,"heapUsed":2614}
{"rss":106760,"heapTotal":9011,"heapUsed":2690}
{"rss":106760,"heapTotal":9011,"heapUsed":2854}
{"rss":106760,"heapTotal":9011,"heapUsed":2953}
{"rss":106760,"heapTotal":9011,"heapUsed":3241}
{"rss":106760,"heapTotal":9011,"heapUsed":3391}
{"rss":106760,"heapTotal":9011,"heapUsed":3535}
{"rss":107288,"heapTotal":9011,"heapUsed":3797}
{"rss":108248,"heapTotal":9011,"heapUsed":1908}

Reading CSV file and sending data in intervals with websockets (Node, Socket.io)

I'm relatively new to Node and Express.js. I'm trying to create a websocket server that pushes CSV data line after line, at irregular intervals that are stored in the file itself.
The CSV structure is something like this:
[timeout [ms], data1, data2, data3 ...]
I've successfully created a websocket server which communicates with the client.
I'm looking for the best way to effectively do something like this:
1. Read a line of the CSV file
2. Send a line with WebSockets
3. Pause the reading for a period of time stored in the first value of the row
4. Resume the reading after the interval has passed, and back to step 1.
This is what I have so far (please feel free to trash my code completely, as it might be very wrong; as I said, I'm new to this). It seems like pause() doesn't do anything.
var $ = require('jquery'),
    csv = require('csv');

exports.index = function(server){

    var io = require('socket.io').listen(server);

    io.sockets.on('connection', function (socket) {

        socket.on('startTransmission', function(msg) {
            csv()
                .from.path('C:/dev/node_express/csv/test.csv', { delimiter: ',', escape: '"' })
                .on('record', function(row, index){
                    var rowArray = $.parseJSON(JSON.stringify(row));
                    var json = {},
                        that = this;

                    $.each(rowArray, function(i, value){
                        json[keys[i]] = value;
                    });

                    socket.emit('transmitDataData', json);

                    //this.pause(); //I guess around here is where I'd like to pause
                    // setTimeout(function(){
                    //     that.resume(); //and resume here after the timeout, stored in the first value (rowArray[0])
                    // }, rowArray[0]);
                });
        });
    });
};
The commented-out code unfortunately does not work: all data is sent immediately, row after row; the function doesn't pause.
I ran into the same sort of thing with another use case. The issue is that calling pause() on the stream pauses the underlying stream reading but not the csv record parsing, so the record event can get called with the remainder of the records that made up the last read stream chunk. I synchronized them, in my case, like this:
var rows = 0, actions = 0;

stream.on('record', function(row, index){
    rows++;

    // pause here, but expect more record events until the raw read stream is exhausted
    stream.pause();

    runner.do(row, function(err, result) {
        actions++;
        // when actions have caught up to the rows read, read more rows
        if (actions == rows) {
            stream.resume();
        }
    });
});
In your case, I'd buffer the rows and release them with the timer. Here's an untested re-factoring just to give you an idea of what I mean:
var $ = require('jquery'),
    csv = require('csv');

exports.index = function(server){

    var io = require('socket.io').listen(server);

    io.sockets.on('connection', function (socket) {

        socket.on('startTransmission', function(msg) {
            var timer = null,
                buffered = [],
                stream = csv().from.path('C:/dev/node_express/csv/test.csv', { delimiter: ',', escape: '"' });

            function transmit(row) {
                socket.emit('transmitDataData', row);
            }

            function drain(timeout) {
                if (!timer) {
                    timer = setTimeout(function() {
                        timer = null;
                        if (buffered.length <= 1) { // get more rows ahead of time so we don't run out; otherwise, we could skip a beat
                            stream.resume(); // get more rows
                        } else {
                            var row = buffered.shift();
                            transmit(row);
                            drain(row[0]);
                        }
                    }, timeout);
                }
            }

            stream.on('record', function(row, index){
                stream.pause();
                if (index == 0) {
                    transmit(row);
                } else {
                    buffered.push(row);
                }
                drain(row[0]); // assuming row[0] contains a timeout value
            });

            stream.on('end', function() {
                // no more rows; wait for the buffer to empty, then clean up
            });

            stream.on('error', function() {
                // handle error
            });
        });
    });
};
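For completeness, the same pause-between-lines idea can also be sketched with Node's built-in readline module instead of the csv package (untested; the function name is made up, and the same caveat applies in that a few already-buffered 'line' events may still fire right after pause()):

var fs = require('fs');
var readline = require('readline');

function streamCsvWithDelays(filePath, send) {
    var rl = readline.createInterface({ input: fs.createReadStream(filePath) });

    rl.on('line', function (line) {
        var fields = line.split(',');
        var timeout = parseInt(fields[0], 10) || 0; // first column holds the delay in ms

        rl.pause();                                  // stop reading further lines for now
        send(fields);                                // e.g. socket.emit('transmitDataData', fields)
        setTimeout(function () { rl.resume(); }, timeout);
    });
}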

streams with percentage complete

I need to stream a file in base64 to an http endpoint using something like request or superagent. What is the best way to figure out what percentage of the file has been uploaded?
I assume I can create the read stream using something like:
fs.createReadStream('/tmp/cats.jpg', {encoding: 'base64'})
Any examples using one out of above libraries would be greatly appreciated.
I think you can use progress-stream.
Here is an example from the package:
var progress = require('progress-stream');
var fs = require('fs');

var stat = fs.statSync(filename);

var str = progress({
    length: stat.size,
    time: 100 /* ms */
});

str.on('progress', function(progress) {
    console.log(progress);

    /*
    {
        percentage: 9.05,
        transferred: 949624,
        length: 10485760,
        remaining: 9536136,
        eta: 42,
        runtime: 3,
        delta: 295396,
        speed: 949624
    }
    */
});

fs.createReadStream(filename)
    .pipe(str)
    .pipe(fs.createWriteStream(output));
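Since the question mentions request, here is a rough, untested sketch of piping the same progress stream into an upload instead of a local file (the URL is a placeholder; note that reading with { encoding: 'base64' } pushes roughly 4/3 more data through the stream, so length is scaled accordingly):

var fs = require('fs');
var request = require('request');
var progress = require('progress-stream');

var filename = '/tmp/cats.jpg';
var stat = fs.statSync(filename);

var str = progress({
    length: Math.ceil(stat.size / 3) * 4, // base64 expands the payload by ~4/3
    time: 100
});

str.on('progress', function (p) {
    console.log(p.percentage.toFixed(1) + '% uploaded');
});

fs.createReadStream(filename, { encoding: 'base64' })
    .pipe(str)
    .pipe(request.post('http://example.com/upload'));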
I was looking for an answer to a similar issue and thanks to Alberto Zaccagni's answer, I was able to get some code working.
So for the people who don't want to piece the puzzle themselves, here is the code (edited for Stackoverflow):
var zipfile = "my_large_archive.zip";

// Get the size of the file
fs.stat(zipfile, function (err, stats) {
    var zipSize = stats.size;
    var uploadedSize = 0; // Incremented by on('data') to keep track of the amount of data we've uploaded

    // Create a new read stream so we can plug events on it, and get the upload progress
    var zipReadStream = fs.createReadStream(zipfile);

    zipReadStream.on('data', function(buffer) {
        var segmentLength = buffer.length;

        // Increment the uploaded data counter
        uploadedSize += segmentLength;

        // Display the upload percentage
        console.log("Progress:\t", ((uploadedSize / zipSize * 100).toFixed(2) + "%"));
    });

    // Some other events you might want for your code
    zipReadStream.on('end', function() {
        console.log("Event: end");
    });

    zipReadStream.on('close', function() {
        console.log("Event: close");
    });

    var formData = require('form-data');
    var form = new formData();
    form.append('apikey', 'f4sd5f4sdf6ds456'); // Just some post parameters I need to send to the upload endpoint
    form.append('file', zipReadStream); // The zip file, passed as an fs.createReadStream instance

    // Submit the form and the file
    form.submit('http://www.someserver.com/upload', function(err, res) {
        if (err) {
            console.log("Oops! We encountered an error :(\n\n", err);
            return false;
        }

        console.log("Your file has been uploaded.");
        res.resume(); // Fix if you use this code in a CLI, so that execution will stop and let users enter new commands
    });
});
In Node.js we have the Readable stream; it emits the 'data' event when it receives a chunk of data. By knowing the file size you can easily keep track of how much data passes through the 'data' event handler and then update the percentage.
Get the file size with:
require('fs').watchFile('yourfile', function () {
    fs.stat('yourfile', function (err, stats) {
        console.log(stats.size);
    });
});
