Insert a large CSV file (200,000+ rows) into MongoDB in Node.js

I'm trying to parse and insert a big CSV file into MongoDB, but when the file exceeds 100,000 rows I get a bad response from the server. And the files I need to insert are usually above 200,000 rows.
I've tried both bulk insert (insertMany) and a Babyparse (Papaparse) streaming approach to insert the file row by row, but with poor results.
Node api:
router.post('/csv-upload/:id', multipartMiddleware, function(req, res) {
    // Post variables
    var fileId = req.params.id;
    var csv = req.files.files.path;

    // Create a queue object with concurrency 5
    var q = async.queue(function(row, callback) {
        var entry = new Entry(row);
        entry.save();
        callback();
    }, 5);

    baby.parseFiles(csv, {
        header: true, // Includes header in JSON
        skipEmptyLines: true,
        fastMode: true,
        step: function(results, parser) {
            results.data[0].id = fileId;
            q.push(results.data[0], function (err) {
                if (err) { throw err; }
            });
        },
        complete: function(results, file) {
            console.log("Parsing complete:", results, file);
            q.drain = function() {
                console.log('All items have been processed');
                res.send("Completed!");
            };
        }
    });
});
This streaming approach results in: POST SERVER net::ERR_EMPTY_RESPONSE
Not sure if I'm using the async.queue correctly though.
Is there a better and more efficient way to do this OR am I doing something wrong?
Express Server:
// Dependencies
var express = require('express');
var path = require('path');
var bodyParser = require('body-parser');
var routes = require('./server/routes');
var mongoose = require("mongoose");
var babel = require("babel-core/register");
var compression = require('compression');
var PORT = process.env.PORT || 3000;

// Include the cluster module
var cluster = require('cluster');

mongoose.connect(process.env.MONGOLAB_URI || 'mongodb://localhost/routes');

// Code to run if we're in the master process
if (cluster.isMaster) {
    // Count the machine's CPUs
    var cpuCount = require('os').cpus().length;

    // Create a worker for each CPU
    for (var i = 0; i < cpuCount; i += 1) {
        cluster.fork();
    }

// Code to run if we're in a worker process
} else {
    // Express
    var app = express();
    app.use(bodyParser.json({limit: '50mb'}));
    app.use(bodyParser.urlencoded({limit: '50mb', extended: true}));

    // Compress responses
    app.use(compression());

    // Used for production build
    app.use(express.static(path.join(__dirname, 'public')));

    routes(app);

    // Routes
    app.use('/api', require('./server/routes/api'));
    app.all('/*', function(req, res) {
        res.sendFile(path.join(__dirname, 'public/index.html'));
    });

    // Start server
    app.listen(PORT, function() {
        console.log('Server ' + cluster.worker.id + ' running on ' + PORT);
    });
}

Handling the import:
Great question. From my experience, by far the fastest way to insert a CSV into Mongo is via the command line:
mongoimport -d db_name -c collection_name --type csv --file file.csv --headerline
I don't believe Mongoose has a way of calling mongoimport (someone correct me if I'm wrong), but it's simple enough to call from Node directly:
var exec = require('child_process').exec;
var cmd = 'mongoimport -d db_name -c collection_name --type csv --file file.csv --headerline';

exec(cmd, function(error, stdout, stderr) {
    // do whatever you need during the callback
});
The above will have to be modified to be dynamic, but it should be self-explanatory.
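For instance, a minimal sketch of a dynamic version wired into the upload route from the question (the database name routes and the collection name entries are placeholders, not taken from the original post):

var exec = require('child_process').exec;

router.post('/csv-upload/:id', multipartMiddleware, function(req, res) {
    // Path the multipart middleware wrote to disk
    var filePath = req.files.files.path;

    // NOTE: placeholder db/collection names; also, interpolating a path into
    // a shell command like this needs escaping (or execFile) in real code
    var cmd = 'mongoimport -d routes -c entries --type csv --headerline' +
              ' --file ' + filePath;

    exec(cmd, function(error, stdout, stderr) {
        if (error) {
            return res.status(500).send(stderr);
        }
        res.send('Completed!');
    });
});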
Handling the upload:
Uploading the file from a front-end client is another challenge.
Most browsers will time out if you make a request to a server and don't get a response within 60 seconds (probably what you are seeing above).
One solution would be to open a socket connection (search for socket.io on npm for details). This creates a persistent connection to the server that isn't subject to the timeout restrictions.
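As a rough sketch of that idea (assuming a socket.io server io is already attached to the Express server; the event names start-import and import-done are made up for illustration):

io.on('connection', function(socket) {
    socket.on('start-import', function(fileId) {
        // runImport would wrap the mongoimport call shown above
        runImport(fileId, function(err) {
            // The acknowledgement travels over the socket, so the
            // browser's HTTP timeout never comes into play
            socket.emit('import-done', {status: err ? 'error' : 'ok'});
        });
    });
});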
If uploading is not an issue, and the timeout is due to the slow parsing/inserting then you may not have to worry about this once you implement the above.
Other considerations:
I'm not sure exactly what you need to send back to the user, or what parsing needs to take place. But that can either be done outside of the normal request/response cycle, or can be handled during a socket connection if it's needed during one request/response cycle.
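One more note on the code in the question, since it may explain the empty response: entry.save() is asynchronous, so calling callback() immediately afterwards tells the queue the row is done before Mongo has confirmed anything, and save errors never reach the q.push callback. A small sketch of the corrected worker (same Entry model as in the question):

var q = async.queue(function(row, callback) {
    var entry = new Entry(row);
    entry.save(function(err) {
        // Signal completion (or failure) only after Mongo responds
        callback(err);
    });
}, 5);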

Related

Express JS way to clear memory without shutting down the server?

I currently have an Express.js server running. The machine has 4 GB of memory; not all of it is allocated to Express, so it shares with other processes.
server.js code is:
// server.js
var webshot = require('./lib/webshot');
var fs = require('fs');
var http = require('http');
var bodyParser = require('body-parser');
var express = require('express');

var app = express();
app.use(bodyParser.urlencoded());
// your express configuration here

var httpServer = http.createServer(app);

// For http
httpServer.listen(8080);

app.post('/', function (req, res) {
    console.log(req.body);
    var firstLine = req.body.firstLine;
    var secondLine = req.body.secondLine;
    var previewID = req.body.previewId;
    var productPlate = req.body.prodName;
    res.header('Access-Control-Allow-Origin', 'https://citylocs.com');
    res.header('Access-Control-Allow-Methods', 'GET, POST, PUT');
    res.header('Access-Control-Allow-Headers', 'X-Requested-With, Content-Type');
    takeWebshot(firstLine, secondLine, previewID, productPlate)
        .then(() => console.log("a process exit here kills the server"));
    res.end();
});

function takeWebshot(fLine, sLine, pID, prodPlate) {
    var options = {
        takeShotOnCallback: true,
        captureSelector: '#img_preview_fancybox',
        licensePlate: 'Guzman Plate'
    };
    // Log memory usage before each shot
    const used = process.memoryUsage();
    console.log(used);
    return new Promise((resolve, reject) => {
        webshot('example.com/preview/productpreview/testy.html?prod=' + prodPlate, '../screenshot/' + pID + '.png', options, function (err) {
            if (!err) {
                resolve();
            } else {
                reject(err);
            }
        });
    });
}
In the above, webshot goes to a URL, takes a screenshot, and saves it to a file on my server.
I get the following results:
{ rss: 29876224,
heapTotal: 18694144,
heapUsed: 12250200,
external: 390493 }
The memoryUsage call usually returns about the same values on multiple runs.
The server takes a request, runs takeWebshot(), and completes. My problem is that after about 20 minutes of this, the server runs out of memory, gives me an error, and has to be restarted. Is there anything I can add to "clear memory"? Quitting the process is not a solution, because then I have to restart the server.js file again. I would like to avoid a reset of server.js.
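A quick way to see whether the heap really grows with each request is to log memory per response, reusing the process.memoryUsage() call already shown above (a diagnostic sketch, not a fix):

// Log heap usage after every response; if heapUsed climbs steadily
// and never drops back after a GC, something is being retained
app.use(function (req, res, next) {
    res.on('finish', function () {
        var m = process.memoryUsage();
        console.log(req.method, req.url,
            'heapUsed:', Math.round(m.heapUsed / 1048576) + ' MB');
    });
    next();
});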

Node.js - how to handle multiple asynchronous tasks

I'm new to Node and got stuck handling multiple async tasks.
Apart from Node, I've got another server (S1) which doesn't return data immediately on request; it can return multiple types of data and can also send notifications without them being requested specifically, so Node has to listen for its data, parse it, and act accordingly.
The connection to this server (S1) is done by using:
S1 = net.createConnection({'host':S1Host, 'port': S1Port});
And node listens to data with:
S1.on('data', function(data){
    S1DataParse(data);
});
I have to route the correct data (after parsing it) to a specific POST request.
app.post('/GetFooFromS1', function(req, res){
    // Send request to S1
    S1.write({'type':'foo'});

    // If we get the correct data sometime in the future, send the response to the browser
    res.setHeader('Content-Type', 'application/json');
    res.json({'status':'success', 'value':S1FooData});
});
I tried to use the async module for that, but with no success.
What I was trying to do:
var asyncTasks = [];

app.post('/GetFooFromS1', function(req, res){
    asyncTasks.push(function(callback){
        // Send request to S1
        S1.write({'type':'foo'});
    });

    async.parallel(asyncTasks, function(response){
        res.setHeader('Content-Type', 'application/json');
        res.json({'status':'success', 'value':response});
    });
});
and another task in S1DataParse:
function S1DataParse(data){
    if(data.type === 'foo'){
        asyncTasks.push(function(callback){
            callback(data);
        });
    }
}
But, of course, the second task never gets added to the asyncTasks array. I really got stuck on that.
Can you please help me with that?
Thanks
-=-=-=- Edit -=-=-=-
Eventually, I came across events and EventEmitter().
From the POST request I call the function that sends requests to the data server (DataServerClientGet).
In this function I register a listener which will receive the future data:
eventEmitter.on('getData', returnDataServerData);
It all works great except for one thing. Whenever I refresh the page or add other POST requests, I get an error:
Error: Can't set headers after they are sent.
It would be great to get this problem solved. Help me, please.
Thanks ;)
The whole code looks like this:
var express = require('express');
var app = express();
var http = require('http').Server(app);
var bodyParser = require('body-parser');
var net = require('net');
var events = require('events');

var dataServerHost = '127.0.0.1';
var dataServerPort = 12345;
var dataServerClient;
var logMsg;

var eventEmitter = new events.EventEmitter();

/*******************************************/
// Init
/*******************************************/
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: true}));
app.use(express.static(__dirname + '/public'));

/*******************************************/
// Connect to the data server
/*******************************************/
DataServerConnect();

/*******************************************/
// Open listener on port 3000 (to browser)
/*******************************************/
http.listen(3000, function(){
    logMsg = 'listening on *:3000';
    console.log(logMsg);
});

/*******************************************/
// Routing
/*******************************************/
app.get('/', function(req, res){
    res.sendFile(__dirname + '/index.html');
});

app.post('/GetDataFoo', function(req, res){
    var msg;
    var size;
    msg = '\n{"Type":"Query", "SubType":"GetDataFoo","SearchFilter":""}';
    size = msg.length;
    logMsg = 'Client to DataServer: GetDataFoo';
    console.log(logMsg);
    DataServerClientGet('GetDataFoo', size, msg, res);
});

/*******************************************/
// Functions
/*******************************************/
function DataServerConnect(){
    dataServerClient = net.createConnection({'host':dataServerHost, 'port': dataServerPort}, function(){
        logMsg = 'Connected to DataServer ['+dataServerHost+':'+dataServerPort+']';
        console.log(logMsg);
    });
    dataServerClient.on('data', function(data){
        logMsg = 'DataServerData>>>\n'+data.toString()+'DataServerData<<<';
        console.log(logMsg);
        DataServerDataParse(data.toString());
    });
    dataServerClient.on('end', function(){
        logMsg = 'Disconnected from DataServer';
        console.log(logMsg);
    });
}

function DataServerClientGet(type, size, msg, res){
    dataServerClient.write('Type: Json\nSize: '+size+'\n\n'+msg, function(err){
        var returnDataServerData = function returnDataServerData(results){
            res.setHeader('Content-Type', 'application/json');
            res.json({'status':'success', 'value':results});
        };
        eventEmitter.on('getData', returnDataServerData);
    });
}

function DataServerDataParse(json){
    if(json.Type=='GetDataFoo')
    {
        var MessageList = json.MessageList;
        eventEmitter.emit('getData', MessageList);
    }
}
-=-=-=- Edit -=-=-=-
The Error: Can't set headers after they are sent. was caused by adding another listener of the same type every time DataServerClientGet was called, so res was being sent multiple times.
I solved this by calling removeListener(event, listener) right after the res, inside the function. Anyway, I think it's wrong and could cause problems if there are multiple calls to DataServerClientGet with the same type, etc.
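A tidier variant of the same fix, assuming the data server sends exactly one response per request, is to register the handler with once(), which detaches itself after the first event, so the response can never be sent twice by a stale listener (concurrent requests of the same type would still race, though):

function DataServerClientGet(type, size, msg, res){
    dataServerClient.write('Type: Json\nSize: '+size+'\n\n'+msg, function(err){
        // once() fires at most one time and then removes itself
        eventEmitter.once('getData', function(results){
            res.setHeader('Content-Type', 'application/json');
            res.json({'status':'success', 'value':results});
        });
    });
}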
There is an optional callback parameter that you can pass to the write function (docs), something like:
S1.write({'type':'foo'}, function(err){
    if(err){
        // Handle error
    }else{
        res.setHeader('Content-Type', 'application/json');
        res.json({'status':'success', 'value':response});
    }
});
This can work with the post route, but in your 'data' listener you can't send data from the server to the client when there is no connection initialized by the client (it is not bidirectional). If you want bidirectional behavior, you can check out socket.io.

How to output MongoDB collections in a Node.js app to get them in the response

I am using the Cloud9 environment for developing my Node.js app. In it I have written code to connect to a MongoDB database. I am successfully connecting to the database and adding records to a collection.
Now I want to send the collection info back in the response, but res.send(collectionInfo); is not working.
Let me know how I should achieve this.
Below is the code of my server.js file
var Db = require('mongodb').Db;
var http = require('http');
var path = require('path');
var async = require('async');
var socketio = require('socket.io');
var express = require('express');
var ejs = require('ejs');
var app = express();
var helpers = require('express-helpers');
var MongoClient = require('mongodb').MongoClient;
var Server = require('mongodb').Server;
var db;

helpers(app);

var bodyParser = require('body-parser');
app.use(bodyParser.json()); // for parsing application/json
app.use(bodyParser.urlencoded({extended: true})); // for parsing application/x-www-form-urlencoded

var server = http.Server(app);
server.listen(process.env.PORT || 3000, process.env.IP || "0.0.0.0", function () {
    var addr = server.address();
    console.log("Chat server listening at", addr.address + ":" + addr.port);
});

app.use(express.static(__dirname + '/public'));
app.set('views', __dirname + '/public/views');
app.engine('html', require('ejs').renderFile);
app.set('view engine', 'html');
//app.use(express.static(__dirname + '/client'));
app.use(express.static(path.join(__dirname, '/client')));

// MongoDB Connection
app.use(function(req, res, next) {
    next();
});

app.get('/monogdb', function (req, res) {
    res.render('monogdb.ejs');
});

app.post('/ajax-mongo-connect', function (req, res) {
    var mongoClient = new MongoClient(new Server('localhost', 27017));
    mongoClient.open(function(err, mongoClient) {
        if(err){
            console.log(err);
        }else{
            var db = mongoClient.db("mydb");
            db.createCollection("students", { name : req.body.nm, description : req.body.desc, location : req.body.loc });
            console.log('database connected', db);
            var collectionInfo = db.collection("students");
            mongoClient.close();
            //res.setHeader('Content-Type', 'application/json');
            res.send(collectionInfo);
        }
    });
});
As per @Roman Sachenko's answer, I have tried to use
res.send(collectionInfo.toJSON()); but it gives the error below:
/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/db.js:299
        throw err;
              ^
TypeError: Object #<Collection> has no method 'toJSON'
    at /home/ubuntu/workspace/server.js:66:41
    at MongoClient.open (/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/mongo_client.js:103:5)
    at Db.open (/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/db.js:296:11)
    at process._tickCallback (node.js:442:13)
and using res.send({data: collectionInfo}); gives this error:
/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/db.js:299
        throw err;
              ^
TypeError: Converting circular structure to JSON
    at Object.stringify (native)
    at ServerResponse.res.json (/home/ubuntu/workspace/node_modules/express/lib/response.js:185:19)
    at ServerResponse.res.send (/home/ubuntu/workspace/node_modules/express/lib/response.js:117:21)
    at /home/ubuntu/workspace/server.js:67:21
    at MongoClient.open (/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/mongo_client.js:103:5)
    at Db.open (/home/ubuntu/workspace/node_modules/mongodb/lib/mongodb/db.js:296:11)
    at process._tickCallback (node.js:442:13)
Try to return this: res.status(200).json({'myCollection' : collectionInfo});.
You can find more details about express response here
Update:
Now that you have explained the details, take a look at the code below:
app.post('/ajax-mongo-connect', function (req, res) {
    var mongoClient = new MongoClient(new Server('localhost', 27017));
    mongoClient.open(function(err, mongoClient) {
        if(err){
            console.log(err);
            res.status(500).json({message : 'OMG, an error occurred'});
        }else{
            var db = mongoClient.db("mydb");
            db.createCollection("students", { name : req.body.nm, description : req.body.desc, location : req.body.loc });
            console.log('database connected', db);
            var collectionInfo = db.collection("students");
            // Here we will find all students
            collectionInfo.find({}).toArray(function(err, students) {
                // so now we can return all students to the screen
                res.status(200).json({'myCollection' : students});
            });
        }
    });
});
Cheers!
Mongoose ODM
First of all, I would recommend using the Mongoose ODM:
https://github.com/Automattic/mongoose
It will make your work with the database much easier.
Basically, Mongoose returns a normal object as its results, but in case of issues you may try toObject() or toJSON(), or, as mentioned, create your own object like {data: mongoCollection}.
Examples:
res.send(collectionInfo.toObject());
res.send(collectionInfo.toJSON());
res.send({data: collectionInfo});
Please refer to the link in case of questions:
http://mongoosejs.com/docs/guide.html#toJSON
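For illustration, a minimal Mongoose sketch built around the students collection from the question (the schema fields are guesses based on the form fields nm, desc, and loc; the model name Student is hypothetical):

var mongoose = require('mongoose');
mongoose.connect('mongodb://localhost/mydb');

// Hypothetical schema matching the fields posted to /ajax-mongo-connect
var Student = mongoose.model('Student', new mongoose.Schema({
    name: String,
    description: String,
    location: String
}));

app.post('/ajax-mongo-connect', function (req, res) {
    Student.create({
        name: req.body.nm,
        description: req.body.desc,
        location: req.body.loc
    }, function (err, student) {
        if (err) return res.status(500).json({message: 'insert failed'});
        // Mongoose documents serialize to JSON cleanly, unlike the
        // driver's Collection object
        res.status(200).json({data: student});
    });
});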
Native Driver
As for the native driver, it should also return a normally-constructed object, but based on issues I've faced in the past, JSON.stringify always helps if you set headers manually.
You can also check the contents of your entity by printing it with console.log(collectionInfo); and making sure the correct object is inside.
Depending on the results, you can take actions like:
res.send(JSON.stringify(collectionInfo)) // set headers manually
res.json(JSON.stringify(collectionInfo)) // you don't need to set headers
At least you will know what exactly is inside collectionInfo. I think that will be enough to investigate the issue.
You can view circular JSON objects by doing this in Node.js:
const util = require('util') // Native node module
util.inspect(circularObj)
You can call it from anywhere in the code, so it's very versatile.
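For example (depth: null is an assumption about what you want; it expands nested objects fully instead of stopping at the default depth of 2):

const util = require('util');

// Unlike JSON.stringify, util.inspect tolerates circular references,
// printing them as [Circular] instead of throwing
console.log(util.inspect(circularObj, { depth: null }));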

How to add socket.io subscribers in Node.js without rerunning setup code

Now edited with solution
I'm writing a web page, backed by Node and Express, which uses the Twitter streaming API to plot the location of tweets containing a given word onto a Google map. The page connects to the server and places each location-tagged tweet that the server emits onto the map.
The Node.js application is below. twitFactory just gives me a Twitter object from the twitter npm module with my various connection strings.
'use strict';
var express = require('express');
var app = express();
var server = require('http').Server(app);
var io = require('socket.io')(server);
var twitFactory = require('./lib/twitFactory');

var twit = twitFactory.twit();
var searchTerm = process.argv[2];
var locations = '-180,-90,180,90';

server.listen(3000);

app.use('/bower_components', express.static(__dirname + '/bower_components'));

app.get('/', function (req, res) {
    res.sendFile(__dirname + '/index.html');
});

io.on('connection', function (socket) {
    twit.stream('filter', {locations: locations}, function (stream) {
        stream.on('data', function (tweet) {
            if (tweet.text && tweet.text.toLowerCase().indexOf(searchTerm.toLowerCase()) != -1) {
                if (tweet.text && tweet.coordinates && tweet.coordinates.coordinates && tweet.coordinates.type === 'Point') {
                    socket.emit('tweet', {
                        text: tweet.text,
                        lon: tweet.coordinates.coordinates[0],
                        lat: tweet.coordinates.coordinates[1]
                    });
                }
            }
        });
    });
});
I have this working for a single web client, which is fine for my development environment, but I want to avoid re-initialising the Twitter connection inside the 'connection' event every time a new client connects, or I would quickly hit my API call limit.
I think I want to define the Twitter stream separately and then add new connections to the socket's subscribers, but I can't work out how to code this without the nested structure you see above.
(I realise that it's very inefficient to get all tweets with location then filter by content, but very few tweets have location attached, and the API does not currently let you specify location AND search term, it gives you all tweets with either.)
UPDATE:
I found a solution, which was to use a different Twitter Node module that allows the stream object and its events to be declared separately: it's twit (as opposed to twitter) on npm. The code now looks like this:
'use strict';
var express = require('express');
var app = express();
var server = require('http').Server(app);
var io = require('socket.io')(server);
var logger = require('./lib/logger');
var twitFactory = require('./lib/twitFactory');

var searchTerm = process.argv[2];
var locations = ['-180', '-90', '180', '90'];
var twit = twitFactory.twit();
var tweetStream = twit.stream('statuses/filter', {locations: locations});

server.listen(3000);

app.use('/bower_components', express.static(__dirname + '/bower_components'));

app.get('/', function (req, res) {
    res.sendFile(__dirname + '/index.html');
});

io.on('connection', function (socket) {
    console.log('new connection');
    tweetStream.on('tweet', function (tweet) {
        if (tweet.text && tweet.text.toLowerCase().indexOf(searchTerm.toLowerCase()) != -1) {
            if (tweet.text && tweet.coordinates && tweet.coordinates.coordinates && tweet.coordinates.type === 'Point') {
                socket.emit('tweet', {
                    text: tweet.text,
                    lon: tweet.coordinates.coordinates[0],
                    lat: tweet.coordinates.coordinates[1]
                });
            }
        }
    });
});
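One caveat with this version (my observation, not from the original post): every connection attaches a new 'tweet' handler to the shared tweetStream and never detaches it, so handlers pile up as clients come and go. A small sketch of cleaning up on disconnect:

io.on('connection', function (socket) {
    var onTweet = function (tweet) {
        // same filtering and emit logic as above
        socket.emit('tweet', { text: tweet.text });
    };
    tweetStream.on('tweet', onTweet);

    // Detach this client's handler when it goes away, otherwise the
    // stream keeps a reference to every dead socket
    socket.on('disconnect', function () {
        tweetStream.removeListener('tweet', onTweet);
    });
});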

Adding Socket.io to an Express 4 Server - multiple file setup

I've been bouncing back and forth between socket.io and express.io, but settled on socket.io with Express 4, as I would like to use namespaces.
I have worked through some examples of an Express 4 server using socket.io, but most examples are based on one file with everything in it. I am trying to separate all my code to make it easier to manage, but I am at a loss as to how (or where) to add socket.io.
I have index.js which uses Cluster and basically calls server.js:
var server = require( "./server.js" );
var cluster = require('cluster');
var webApp={
run: function(){
console.log('Starting: Server');
server.listen();
}
};
if(cluster.isMaster){
cluster.fork();
cluster.on('exit',function(worker){
console.log('Worker ' + worker.id + ' died..');
setTimeout( function () { cluster.fork(); }, 1000 );
});
} else{
try {
webApp.run();
}
catch(e)
{
console.log(e);
process.exit(1);
}
process.on('uncaughtException', function(err){
console.log(err);
process.exit(1);
});
process.on( 'SIGINT', function () {
console.log( "\n SIGINT (Crtl-C)" );
//Kill worker
cluster.disconnect();
process.exit(1);
});
}
This then calls the server.js file:
var path = require('path');
var express = require('express');
var bodyParser = require('body-parser');
var config = require('./config/config.js');
var router = require('./routes');

var Server = Object.subClass({
    /**
     * Constructor
     */
    init: function(){
        this.appServer = express();
        var that = this;
        var appServer = this.appServer;

        appServer.use(express.static(__dirname + '/public'));
        appServer.set('views', path.join(__dirname, 'views'));
        appServer.set('view engine', 'ejs');
        appServer.use(bodyParser.urlencoded({ extended: true }));
        appServer.use(bodyParser.json());
        appServer.get('/', router.root);
    },

    /**
     * Listener HTTP
     */
    listen: function(){
        var port = config.rest.port;
        console.log(':: on port:' + port);
        this.appServer.listen(port);
    }
});

module.exports = new Server();
I have only one 'route', '/', which is defined in the routes.js file. The page loads fine, but where do I add the server-side socket.io? And do I add any socket.io namespace definitions in the routes.js file, or in the JavaScript of the page being loaded?
There are so many ways of using sockets that I can't seem to work out the best approach for my multi-file setup.
Any help would be brilliant, as I seem to be going in circles.
Enjoy your Saturday :)
Thanks again.
I've spent the morning looking at the Cluster/Worker approach and decided to use SocketCluster, as it seems to do what I need.
Enjoy your Sunday
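For anyone who wants to stay on plain socket.io with this multi-file layout, one common pattern (a sketch; the sockets.js module and its contents are illustrative, not from the original post) is to grab the http.Server that Express returns from listen() and hand the io instance to its own module:

// server.js, inside the Server class
listen: function(){
    var port = config.rest.port;
    console.log(':: on port:' + port);

    // appServer.listen() returns the underlying http.Server,
    // which is what socket.io attaches to
    var httpServer = this.appServer.listen(port);
    var io = require('socket.io')(httpServer);

    // Keep all socket/namespace code in its own file
    require('./sockets')(io);
}

// sockets.js (hypothetical module)
module.exports = function(io){
    io.of('/news').on('connection', function(socket){
        socket.emit('hello', { msg: 'welcome to the /news namespace' });
    });
};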
