My goal is to insert VERY large CSVs, so right now I use CSV streaming like so:
var myCollection = db.collection(myCollectionId);
var q = async.queue(Collection.insert.bind(myCollection), 10);

csv()
  .from.path(myFilePath, { columns: true })
  .transform(function (data, index, cb) {
    q.push(data, function (err, res) {
      if (err) return cb(err);
      cb(null, res[0]);
    });
  })
  .on('end', function () {
    q.drain = function () {
      // do some stuff
    };
  })
  .on('error', function (err) {
    res.end(500, err.message);
    console.log('on.error() executed');
  });
But when the files get REALLY large, like 70M+, streaming them makes my server very slow and it takes forever, and when I try to load pages on the website they're lethargic during this process.
Why is it not possible to execute a mongo insert using a cron job like this? I ask because the same insert takes maybe 30 seconds from the mongo command line.
P.S. Don't mind the readFile and lines part; I am doing that because I want to be able to test when all the lines have been inserted into the collection after the process has started (I haven't implemented this yet).
var cronJob = require('cron').CronJob;
var spawn = require('child_process').spawn;
var fs = require('fs');

function MongoImportEdgeFile(dataID, filePath) {
  var scriptPath = "/local/main/db/mongodb-linux-x86_64-2.4.5/bin/mongoimport";
  console.log("script path = " + scriptPath);
  var output = "";

  fs.readFile(filePath, 'utf-8', function (err, data) {
    if (err) {
      console.log(err);
      throw err;
    }

    //console.log('data = ' + data);
    var lines = data.split('\n');
    console.log("total lines in file = " + lines.length);

    var job = new cronJob(new Date(), function () {
      // store a reference to 'this', the cronJob object; needed to stop the job after the script is done executing
      var context = this;

      // execute mongoimport asynchronously (spawn expects each argument as a separate array element)
      var script = spawn(scriptPath, ['-d', 'mydb', '-c', 'Data_ForID_' + dataID, '--file', filePath, '--type', 'csv']);
      console.log("Executing mongoimport via node-cron: " + scriptPath);

      // the script has finished executing, so complete the cron job and fire the completion callback
      script.on('close', function () {
        console.log('inside script.on(close, function() for import');
        context.stop();
      });
    }, function () {
      // callback function that executes upon completion
      console.log("Finished executing import");
    }, true);
  });
}
You shouldn't use individual insert calls. You're forcing mongo to perform internal sync with each call -- I think it's even worse given your parallel approach.
Use bulk insertion: it's as easy as calling insert() with an array.
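For example, here is a minimal sketch of what batching could look like with the same csv/transform pipeline from your question; the batch size of 1000 is an arbitrary choice and the error handling is intentionally minimal:

var myCollection = db.collection(myCollectionId);
var batch = [];

csv()
  .from.path(myFilePath, { columns: true })
  .transform(function (data, index, cb) {
    batch.push(data);
    if (batch.length < 1000) return cb(null, data);
    // flush the current batch as a single bulk insert
    var docs = batch;
    batch = [];
    myCollection.insert(docs, function (err) {
      cb(err, data);
    });
  })
  .on('end', function () {
    // insert whatever is left over
    if (batch.length) {
      myCollection.insert(batch, function (err) {
        if (err) console.log(err);
        // every row has been handed to mongo at this point
      });
    }
  })
  .on('error', function (err) {
    console.log(err);
  });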
You could execute mongoimport directly from node by creating a child process. Here's an article on using mongoimport to import a CSV. You can also do JSON.
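A rough sketch of what that could look like; the database name, collection name and the --headerline flag are assumptions based on your question (--headerline matches your {columns: true} option), so adjust as needed:

var spawn = require('child_process').spawn;

function runMongoImport(dataID, filePath, done) {
  // note: spawn wants each argument as a separate array element
  var args = ['-d', 'mydb', '-c', 'Data_ForID_' + dataID, '--type', 'csv', '--headerline', '--file', filePath];
  var child = spawn('mongoimport', args);

  child.stdout.on('data', function (chunk) { console.log(String(chunk)); });
  child.stderr.on('data', function (chunk) { console.error(String(chunk)); });

  child.on('close', function (code) {
    done(code === 0 ? null : new Error('mongoimport exited with code ' + code));
  });
}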
Somehow I missed the part about using mongoimport inside cron. If I understand correctly, it looks like you somehow know which CSVs you would like to import, and you are using cron to check for them.
Have you considered a message queue? This will allow your processor to receive the import job instantaneously instead of on an interval. This will also throttle your processing.
If you need more throughput, you could create additional listener processes that are attached to the same queue. They will compete for the next job. This will allow your solution to scale.
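As a very rough sketch of what a listener process could look like, assuming RabbitMQ via the amqplib package (the queue name and message format here are made up for illustration):

var amqp = require('amqplib/callback_api');

amqp.connect('amqp://localhost', function (err, conn) {
  if (err) throw err;
  conn.createChannel(function (err, ch) {
    if (err) throw err;
    ch.assertQueue('csv-imports', { durable: true });
    // only take one job at a time; add more listener processes to scale out
    ch.prefetch(1);
    ch.consume('csv-imports', function (msg) {
      var job = JSON.parse(msg.content.toString()); // e.g. { dataID: ..., filePath: ... }
      // run mongoimport or a bulk insert for job.filePath here, then ack
      ch.ack(msg);
    }, { noAck: false });
  });
});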
Related
I have this function for deleting specific jars from a remote server:
deleteJars: function(appDir, version, callback) {
  fs.readFile('/file/location', function (err, data) {
    if (err) throw err;
    var array = data.toString().split("\n");
    for (i in array) {
      if (array[i].indexOf('worker') > -1) {
        var ip = array[i].split(" ");
        var ssh = new SSH2Utils();
        var server = {
          host: ip[0],
          username: username,
          password: password
        };
        var myfiles = ssh.exec(server, 'rm ' + appDir + '/' + version + '/jars/myjar*.jar', function (err, stdout, stderr, server, conn, response) {
          if (err) console.log('No jars to delete');
          conn.end();
          callback(response);
        });
      }
    }
  });
}
It gets called in my application with this:
runningService.deleteJars(appDir, version, function() {
});
Immediately after this I have a similar call to a copyJars function which copies new jar files to the same location, and after that a job is run which uses the jars. My problem is that sometimes the copy is done before the delete, so the new jars are copied to the folder and immediately deleted with the old ones. Have I done something wrong with my delete function that allows the application to continue to the next step before completing the delete?
{
  deleteJars: function (appDir, version, callback) {
    fs.readFile('/file/location', function (err, data) {
      if (err) throw err;
      var array = data.toString().split("\n");
      const promises = [];
      for (i in array) {
        if (array[i].indexOf('worker') > -1) {
          var ip = array[i].split(" ");
          var ssh = new SSH2Utils();
          var server = {
            host: ip[0],
            username: username,
            password: password
          };
          promises.push(
            new Promise((resolve, reject) => {
              ssh.exec(server, 'rm ' + appDir + '/' + version + '/jars/myjar*.jar', function (err, stdout, stderr, server, conn, response) {
                if (err) reject('No jars to delete');
                conn.end();
                resolve(response);
              });
            })
          );
        }
      }
      Promise.all(promises).then((results) => { // results will be in order
        // use the results array
        callback(null, results);
      }).catch(err => {
        console.log(err);
      });
    });
  }
}
With callbacks, maintaining order can be a little tough, but with Promises we have a simple API, Promise.all, that you can use as above. If you want to keep using callbacks, you can look into libraries like async that have ways to deal with this.
Without seeing how you have this worked out with your copyJars function, this seems like a classic synchronous vs asynchronous issue.
You are using an asynchronous function, fs.readFile, with a callback. If I understand the basis of how Node.js works correctly, it hands the work of finding and opening the file and reading its contents off to the OS; when that is done, the OS returns to Node and says "here it is", and Node then executes the callback with the file data. This means that while the OS is off finding the files you want to delete, Node will continue executing code, which appears to be your copyJars function. Depending on how quickly the OS returns to your Node process with the required information, this may not happen in the order you expected.
Some solutions may be:
You can use fs.readFileSync. This will execute synchronously and halt execution of other code while it runs. deleteJars sounds like it's not a one-time thing, so this may not be the most efficient solution.
You can implement promises, or look into async/await (see the sketch below).
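For example, a minimal sketch using the Promise-based deleteJars from the other answer (which calls callback(null, results)); copyJars is assumed to take a similar (appDir, version, callback) signature:

const { promisify } = require('util');

const deleteJarsP = promisify(runningService.deleteJars.bind(runningService));
const copyJarsP = promisify(runningService.copyJars.bind(runningService));

async function redeploy(appDir, version) {
  await deleteJarsP(appDir, version); // every rm has completed at this point
  await copyJarsP(appDir, version);   // now it is safe to copy the new jars
}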
I have about 1000 CSV files that need parsing. Each one contains about 1000 rows, for 1 million total records. The data need to be transformed and then saved to the db, which is why I have to do this through my app.
My problem is that the parser gradually slows down as it loops through the files, to the point where it will take forever to complete the run.
Here's how it's currently set up.
var files = [ file1Path, file2Path, ..., file1000Path ];

function parseFile(index) {
  var startTime = new Date().getTime();
  var filePath = files[index];
  var stream = fs.createReadStream(filePath);

  // parse using the fast-csv npm module
  csv.fromStream(stream, { config })
    .on('data', function (row) {
      transformAndSave(row);
    })
    .on('end', function () {
      console.log(new Date().getTime() - startTime + " elapsed");
      parseFile(index + 1);
    });
}

parseFile(0);
I've tried this a few different ways and it's basically the same thing every time. The first file completes in 2 seconds, by the 8th file we're at 5 or 6 seconds, later on it climbs to 24 seconds, etc. Other things I've tried include doing... files.forEach(function (file) { //run the parser }), doing batches of 100 at a time or even 5 at a time, and it makes no difference: it progressively slows down from a rate of 500 per second to 1 or 2 per second.
Does anybody have ideas for how I can prevent this slowdown? Part of the reason could be that stream.on('end') completes before transformAndSave is finished, potentially creating a backlog. But at this point I'm out of ideas and would appreciate any help anyone could offer.
Thanks so much in advance!!
Daniel
Note for Meteor people: I'm calling this function as a Meteor method. Not sure if that makes any difference, but in case it does, now you know.
Update
Here is the log output demonstrating the steady rise in memory usage and processing time.
This seems like a resource problem, as in you're running out of memory. I would try an approach that doesn't use a recursive function, which might allow resources to be released more readily. One approach could be to use async.
var Logger = require('arsenic-logger');
var fs = require('fs');
var async = require('async');
var csv = require('fast-csv');
var path = require('path');

Logger.echoMemoryUsage();

var testDir = path.resolve(__dirname, 'test');

fs.readdir(testDir, (err, files) => {
  Logger.debug(files);

  if (err) {
    Logger.error(err);
  }

  async.mapLimit(files, 2, function (file, cb) {
    var startTime = new Date().getTime();
    var stream = fs.createReadStream(testDir + '/' + file);

    Logger.debug("Reading: " + file);

    config = {};

    // parse using the fast-csv npm module
    csv.fromStream(stream, config)
      .on('data', function (row) {
        //Logger.debug(row);
        //transformAndSave(row);
      })
      .on('error', function (err) {
        Logger.error(err);
        cb(err);
      })
      .on('end', function () {
        Logger.debug(new Date().getTime() - startTime + " elapsed ");
        setTimeout(cb, 1000);
      });
  }, function (err, results) {
    Logger.info("Finished!");
    process.exit(1);
  });
});
I wish to update a Mongo collection in some code that looks like this:
var Q = Npm.require('q');
var db = new Mongo.Collection('mydb');

function doSomething() {
  var d = Q.defer();
  setTimeout(function () {
    d.resolve();
  }, 1000);
  return d.promise;
}

doSomething().then(function () {
  console.log('before find');
  var records = db.find({}).fetch(); // blocking operation never completes
  console.log('after find');
  console.log(records); // should be []
});
When running Meteor with the above code, it will get as far as logging "before find", but then execution halts waiting for db.find to complete. It never completes.
Are there any solutions or workarounds for this?
Update: it seems to be the .fetch() that causes the issue. I need this part though, as I want to manipulate the data I am receiving from Mongo.
Instead of using fetch, add a callback function. It will allow you to manipulate the data after it's retrieved:
var records = db.find({}, function (error, data) {
  // Do something with your data here
});
Edit: with the callback above, a cursor is returned. If you want to get back an array with the results instead, use the following:
var records = db.find({}).toArray(function (error, data) {
  // Do something with your data here
});
I have a grunt task from which I would like to run a node command. The command is not giving any error when I run it, but I was expecting some console output from the task, which I don't seem to be getting at all.
What am I missing in order to run this node task?
grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function () {
  // Force task into async mode and grab a handle to the "done" function.
  var done = this.async();

  // Run some sync stuff.
  grunt.log.writeln('Processing task...');

  grunt.util.spawn({ cmd: 'node', args: ['S3ListBuckets.js'] });

  // And some async stuff.
  setTimeout(function () {
    grunt.log.writeln('All done!');
    done();
  }, 1000);
});
If someone else wants to do something similar, here is the code:
module.exports = function (grunt) {
  grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function () {
    // Force task into async mode and grab a handle to the "done" function.
    var done = this.async();

    // Run some sync stuff.
    grunt.log.writeln('Processing task...');

    grunt.util.spawn({ cmd: 'node', args: ['S3ListBuckets.js'], opts: { stdio: 'inherit' } });
  });
};
And here is the bucket-listing script (S3ListBuckets.js):
var fs = require('fs');
var aws = require('aws-sdk');

aws.config.loadFromPath('./grunt-aws.json');

var s3 = new aws.S3();

s3.listBuckets(function (err, data) {
  if (err) {
    console.log("Error:", err);
  } else {
    for (var index in data.Buckets) {
      var bucket = data.Buckets[index];
      console.log("Bucket: ", bucket.Name, ' : ', bucket.CreationDate);
    }
  }
});
The answer https://stackoverflow.com/a/15045126/519995 suggests using the parameter opts: {stdio: 'inherit'} to have the spawned output streamed into the parent output stream.
That same answer also lists other alternatives: listening to the data event, or piping the streams as you wish.
Also, using timeouts to wait for async tasks is NOT a good idea. If all you are waiting for is the spawned process, you can use a callback to know when it's done. If you have more complex synchronization needs, I suggest starting a new StackOverflow question.
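For example, a sketch of the same task using grunt.util.spawn's completion callback instead of a timeout; the callback receives (error, result, code):

module.exports = function (grunt) {
  grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function () {
    var done = this.async();
    grunt.log.writeln('Processing task...');

    grunt.util.spawn({
      cmd: 'node',
      args: ['S3ListBuckets.js'],
      opts: { stdio: 'inherit' } // stream the child's output straight through
    }, function (error, result, code) {
      if (error) {
        grunt.log.error('S3ListBuckets.js exited with code ' + code);
        return done(false);
      }
      grunt.log.writeln('All done!');
      done();
    });
  });
};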
I am using the Twitter API in my code along with MongoDB. The database is reflecting the correct output, but the script is not terminating. I guess the problem is with the db.server.find({id: myid}, cb); statement in the code below, but I don't know how to work it out.
var Twit = require('../lib/twitter'),
    conf = require('../config1');

var myid;
var twit = new Twit(conf);

var databaseUrl = "mydb2"; // "username:password@example.com/mydb"
var collections = ["server", "followers"];
var db = require("mongojs").connect(databaseUrl, collections);

twit.get('account/verify_credentials', function (err, reply) {
  myid = reply.id;

  function addToServer(myid, cb) {
    db.server.find({
      id: myid
    }, cb);
  }

  addToServer(myid, function (err, resp) {
    if (err) {
      console.log("err");
    } else if (resp.length > 0) {
      console.log("My Id present in server present");
    } else {
      console.log("New to the app. So updating server ");
      db.server.insert({
        id: myid
      });
      db.followers.insert({
        id: myid,
        following: []
      });
    }
  });
});
P.S. This is part of my code; I have also used the process.exit(0) function, but still no help.
I think your issue is related to this: https://github.com/mafintosh/mongojs/issues/15.
Here's a gist. If I call db.close() the program exits, and if I don't, it doesn't. So process.on('exit') must not be the right place to call it.
But the issue is that you have a persistent TCP connection open to the DB, and as long as that's running, the script won't shut down.
Is this a run-once script, or do you need to keep this thing running?
EDIT:
Since the script only needs to run once, I'd use callbacks on your 2 database queries and close the database down in the last callback.
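Something along these lines, as a sketch; mongojs accepts a callback on insert, so the last one is a reasonable place to close the connection:

db.server.insert({ id: myid }, function (err) {
  if (err) console.log(err);
  db.followers.insert({ id: myid, following: [] }, function (err) {
    if (err) console.log(err);
    db.close(); // nothing is holding the event loop open any more, so the script can exit
  });
});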