How to perform mass inserts into MongoDB using Node.js

I have to insert about 1,000,000 documents into MongoDB using Node.js.
I'm generating these documents with a for loop, storing them in an array, and then inserting the whole array into MongoDB.
var codeArray = new Array();

for (var i = 0; i < 1000000; i++) {
    var token = strNpm.generate();
    var now = moment().format('YYYYMMDD hhmmss');
    var doc1 = {
        id: token,
        Discount_strId: "pending",
        Promotion_strCode: token,
        Promotion_strStatus: "I",
        Promotion_dtmGeneratedDate: now,
        User_strLogin: "test",
        Promotion_strMode: "S",
        Promotion_dtmValidFrom: "pending",
        Promotion_dtmValidTill: "pending",
        LastModified_dtmStamp: now
    };
    codeArray.push(doc1);
}

db.collection('ClPromoCodeMaster').insert(codeArray, function (err, result) {
    if (err) {
        console.log(err);
    } else {
        console.log('Inserted Records - ', result.ops.length);
    }
});
The problem I'm facing is that Mongo has a 16 MB limit on an insert, so I can't insert the entire array at once.
Please suggest the most optimal solution.

The main problem is in the request size and not the document size, but it amounts to the same limitation. Bulk operations and the async library with async.whilst will handle this:
var bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp(),
    i = 0;

async.whilst(
    function() { return i < 1000000; },
    function(callback) {
        var token = strNpm.generate();
        var now = moment().format('YYYYMMDD hhmmss');
        var doc = {
            id: token,
            Discount_strId: "pending",
            Promotion_strCode: token,
            Promotion_strStatus: "I",
            Promotion_dtmGeneratedDate: now,
            User_strLogin: "test",
            Promotion_strMode: "S",
            Promotion_dtmValidFrom: "pending",
            Promotion_dtmValidTill: "pending",
            LastModified_dtmStamp: now
        };
        bulk.insert(doc);
        i++;

        // Drain every 1000 operations
        if (i % 1000 == 0) {
            bulk.execute(function(err, response) {
                bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp();
                callback(err);
            });
        } else {
            callback();
        }
    },
    function(err) {
        if (err) throw err;
        console.log("done");
    }
);
I should note that regardless, there is an internal limit of 1000 operations per batch on bulk operations. You can submit larger batches, but the driver will just break them up and still send them to the server in batches of 1000.
1000 is a good number to stay at, though, since it is already in line with how the request will be handled, and it is a reasonable number of documents to hold in memory before draining the request queue and sending to the server.
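For reference, with more recent versions of the Node.js driver the same chunking idea can be expressed with insertMany instead of the Bulk API. A minimal sketch, assuming the same connected db handle plus the strNpm and moment helpers from the question:

// Sketch only: assumes a connected `db`, plus `strNpm` and `moment` from the question.
async function generateAndInsert(total) {
    const collection = db.collection('ClPromoCodeMaster');
    let batch = [];

    for (let i = 0; i < total; i++) {
        const token = strNpm.generate();
        const now = moment().format('YYYYMMDD hhmmss');
        // Build the same document as in the question (remaining fields omitted here for brevity).
        batch.push({ id: token, Promotion_strCode: token, Promotion_dtmGeneratedDate: now, LastModified_dtmStamp: now });

        // Flush every 1000 documents so each request stays small.
        if (batch.length === 1000) {
            await collection.insertMany(batch, { ordered: false });
            batch = [];
        }
    }

    // Flush whatever is left over.
    if (batch.length > 0) {
        await collection.insertMany(batch, { ordered: false });
    }
}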

For inserting millions of records at a time, create Node.js child processes with fork, combined with the MongoDB bulk API.
Child process creation (index.js):
const { fork } = require("child_process");

let counter = 1;

function createProcess(data) {
    const worker = fork("./dbOperation");
    worker.send(data);
    worker.on("message", (msg) => {
        console.log("Worker Message :", counter, msg);
        counter++;
    });
}

function bulkSaveUser(records) {
    const singleBatchCount = 10000; // Save 10,000 records per hit
    const noOfProcess = Math.ceil(records / singleBatchCount);
    let data = {};
    console.log("No of Process :", noOfProcess);
    for (let index = 1; index <= noOfProcess; index++) {
        data.startCount = (index == 1) ? index : (((index - 1) * singleBatchCount) + 1);
        data.endCount = index * singleBatchCount;
        createProcess(data);
    }
}

bulkSaveUser(1500000);
DB Operation (dbOperation.js)
const MongoClient = require('mongodb').MongoClient;

// Collection Name
const collectionName = "";
// DB Connection String
const connString = "";

process.on("message", (msg) => {
    console.log("Initialize Child Process", msg);
    const { startCount, endCount } = msg;
    inputStudents(startCount, endCount);
});

function initConnection() {
    return new Promise(function(resolve, reject) {
        MongoClient.connect(connString, function(err, db) {
            if (err) return reject(err);
            resolve(db);
        });
    });
}

function inputStudents(startCount, endCount) {
    let bulkData = [];
    const types = ['exam', 'quiz', 'homework', 'homework'];

    for (let index = startCount; index <= endCount; index++) {
        let scores = [];
        // each class has 4 grades
        for (let j = 0; j < 4; j++) {
            scores.push({ 'type': types[j], 'score': Math.random() * 100 });
        }
        // there are 500 different classes that they can take
        let class_id = Math.floor(Math.random() * 501); // get a class id between 0 and 500
        let record = { 'student_id': index, 'scores': scores, 'class_id': class_id };
        bulkData.push({ insertOne: { "document": record } });
    }

    initConnection()
        .then((db) => {
            const studentDb = db.db("student");
            const collection = studentDb.collection(collectionName);
            console.log("Bulk Data :", bulkData.length);
            collection.bulkWrite(bulkData, function(err, res) {
                if (err) throw err;
                //console.log("Connected Successfully", res);
                process.send("Saved Successfully");
                db.close();
            });
        })
        .catch((err) => { console.log("Err :", err); });
}
Sample project to insert millions of records into MongoDB using a child process fork.
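One design note, not from the original answer: bulkWrite is ordered by default, so a single failing document stops the rest of that batch. For a pure bulk load it is often worth passing the unordered option, for example:

collection.bulkWrite(bulkData, { ordered: false }, function(err, res) {
    // With ordered:false the server continues past individual failures
    // and is free to apply the operations in parallel.
    if (err) console.log(err);
    process.send("Saved Successfully");
    db.close();
});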

Related

Firebase cloud functions realtime db parallel requests

I have an issue with handling parallel requests with Cloud Functions.
My scenario is to select a driver from the database and update its status.
I do check that status property before updating it, but when I send multiple requests (database onCreate triggers, to be specific) within a second, it doesn't seem to read the updated status property, and it always updates with the information from the last request. I have also noticed that sometimes the requests are processed all together.
What can I do to fix these issues?
index.js
const db = app.database();
const TripManagementUtil = require('./utils').TripManagementUtil;

exports.triggerNotifications = functions.database.ref('/Trip/{pushId}').onCreate((snapshot, context) => {
    var newTrip = snapshot.val();
    var tripKey = context.params.pushId;
    var tripManagementUtil = new TripManagementUtil();
    tripManagementUtil.searchDrivers(tripKey, newTrip, db);
});
utils.js
searchDrivers(tripKey, trip, db){
    const results = [];
    var lat = trip.pickupLocation.lat, long = trip.pickupLocation.lng;
    var vehicleTypeID = trip.vehicleTypeID;
    var alreadyAssigned = trip.alreadyAssigned;
    var self = this;

    if (alreadyAssigned == null || alreadyAssigned == 'undefined') {
        alreadyAssigned = [];
    }

    const geofireQuery = new GeoFire(db.ref('vehicleLocation').child(vehicleTypeID + "")).query({
        center: [lat, long],
        radius: constants.searchRadius
    })
    .on('key_entered', (key, coords, distance) => {
        if (alreadyAssigned.indexOf(key) == -1) {
            var result = {
                driverID: key,
                distance: distance
            };
            results.push(result);
        }
    });

    setTimeout(() => {
        geofireQuery.cancel();
        if (results.length === 0) {
            self.noDriversHandler(alreadyAssigned, tripKey, db);
        } else {
            results.sort((a, b) => a.distance - b.distance);
            var driversAvailable = false;
            var index = 0;

            function checkDriver() {
                db.ref().child("driver").child("available").child(results[index].driverID).once('value').then(function(vehicleSnap) {
                    var vehicle = vehicleSnap.val();
                    if (!driversAvailable) {
                        if (vehicle != null && vehicle.vehicleTypeID == vehicleTypeID
                            && (vehicle.tripStatus != TripVehicleActionEnum.DriverConfirmed && vehicle.tripStatus != TripVehicleActionEnum.VehicleAssigned)
                            && alreadyAssigned.indexOf(vehicle.driverID + "") === -1) {
                            driversAvailable = true;
                            self.driverExistsHandler(trip, tripKey, alreadyAssigned, vehicle, db);
                        }
                        if (!driversAvailable && index + 1 == results.length) {
                            self.noDriversHandler(alreadyAssigned, tripKey, db);
                        }
                        index++;
                    } else {
                        index++;
                        checkDriver();
                    }
                });
            }
            checkDriver();
        }
    }, 1500);
}
To write data to the database where the value is based on an existing value, you'll want to use Firebase Realtime Database transactions. For more on this, and examples, see save data transactionally in the Firebase documentation.
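A minimal sketch of what that could look like for this case, assuming the driver's availability record lives under driver/available/{driverID} and using a hypothetical assignedTripKey field to mark a claimed driver (adjust the names to your schema):

// Sketch only: the path, field names and status value are assumptions, not part of the original code.
function tryClaimDriver(db, driverID, tripKey) {
    const driverRef = db.ref('driver/available/' + driverID);
    return driverRef.transaction((vehicle) => {
        if (vehicle === null) {
            // The node may not be cached locally yet; returning it as-is lets Firebase retry with server data.
            return vehicle;
        }
        if (vehicle.assignedTripKey) {
            // Another request already claimed this driver; abort by returning undefined.
            return;
        }
        vehicle.assignedTripKey = tripKey;
        vehicle.tripStatus = 'DriverConfirmed';
        return vehicle;
    }).then((result) => result.committed); // true only for the request that won the race
}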

node wait for iteration to complete before callback

I have a Lambda function in Node.js that sends a push notification.
In that function I need to iterate through my users, sending a notification to each one, before the callback.
Ideally I would like the iterations to run in parallel.
What would be the best way to do this?
My code is currently as follows, but it does not work as expected because the last user is not always the last to be handled:
var apnProvider = new apn.Provider(options);
var iterationComplete = false;

for (var j = 0; j < users.length; j++) {
    if (j === (users.length - 1)) {
        iterationComplete = true;
    }

    var deviceToken = users[j].user_device_token;
    var deviceBadge = users[j].user_badge_count;
    var notification = new apn.Notification();

    notification.alert = message;
    notification.contentAvailable = 1;
    notification.topic = "com.example.Example";

    apnProvider.send(notification, [deviceToken]).then((response) => {
        if (iterationComplete) {
            context.succeed(event);
        }
    });
}
Use Promise.all instead - map each user's associated apnProvider.send call to a Promise in an array, and when all Promises in the array are resolved, call the callback:
const apnProvider = new apn.Provider(options);

const userPromises = users.map((user) => {
    const deviceToken = user.user_device_token;
    const deviceBadge = user.user_badge_count;
    const notification = new apn.Notification();

    notification.alert = message;
    notification.contentAvailable = 1;
    notification.topic = "com.example.Example";

    return apnProvider.send(notification, [deviceToken]);
});

Promise.all(userPromises)
    .then(() => {
        context.succeed(event);
    })
    .catch(() => {
        // handle errors
    });
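One design note: Promise.all rejects as soon as any single send fails, so the remaining notifications may no longer be awaited. If you want every send to be attempted regardless of individual failures, one option (a sketch, not part of the original answer) is to catch per promise before aggregating:

// Sketch: swallow individual failures so Promise.all always resolves.
const settledPromises = userPromises.map((p) =>
    p.catch((err) => {
        console.error('send failed', err); // keep going; record the failure
        return null;
    })
);

Promise.all(settledPromises).then(() => context.succeed(event));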

get name from id inside loop in node js

Hello all, I am new to Node.js and MongoDB, so I am facing the problem described in the title. I want to match an id from an array and get all the records for that matching id, but that's not working. I have already tried a for loop, but that takes too much time, so I am looking for some kind of query where I pass an id and get the matching results. I also have the function below, where I can pass an id and get the matching result, but I don't know how to call it.
exports.getDistrictFromId = function(req, res, next) {
    var distId = req.params.id;
    districtslib.getDistricts({ _id: utils.toObjectId(distId) }, function(err, district) {
        if (err) {
            return next(err);
        }
        req.store.district = district;
        next();
    });
};
Here is my code
exports.getCompleteTeachersList = function(req, res, next) {
    var query = req.store.get('query');
    var teacherQuery = { enabled: true };
    var searchQuery = '';

    if (!_.isUndefined(query)) {
        // schoolQuery = {'name':new RegExp(query.replace(/([.?*+^$[\]\\(){}|-])/g, "\\$1"),'i')};
        //{ $text: { $search: "amit hinduja"} }
        //teacherQuery = {enabled: true,$or:[{firstName:new RegExp(query.replace(/([.?*+^$[\]\\(){}|-])/g, "\\$1"),'i')},{lastName:new RegExp(query.replace(/([.?*+^$[\]\\(){}|-])/g, "\\$1"),'i')}]};
        if (query.trim() != '') {
            teacherQuery = { enabled: true, $text: { $search: query } };
            searchQuery = query;
        }
    }

    teacherslib.getTeachers(teacherQuery, function(err, teachers) {
        if (err) {
            return next(err);
        }

        var schools = req.store.get('schools');
        for (var i = 0; i < teachers.length; i++) {
            teachers[i].schoolName = "";
            for (var j = 0; j < schools.length; j++) {
                if (teachers[i].schoolId.toString() === schools[j]._id.toString()) {
                    teachers[i].schoolName = schools[j].name;
                    teachers[i].distId = "";
                    var districts = req.store.get('districts');
                    console.log(schools[j].distId);
                    // I want to get the record matching the district id `schools[j].distId`
                    // from the `districts` array loaded on the line above
                    break;
                }
            }
        }

        req.store.set('searchQuery', searchQuery);
        req.store.set('teachers', teachers);
        //req.store.set('districts', districts);
        next();
    });
};
The collection structure is like this:
1) distId comes in the schools collection; using distId, get all the matching records from the district collection.
2) The district array has a countyId, and from that countyId the data has to be fetched from the county collection.
Instead of looping inside a loop, I would suggest you look into the $lookup operator of the aggregation framework. With it you can perform the lookup server-side.
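A rough sketch of what that aggregation could look like, assuming collections named teachers, schools, districts and counties with the id fields used in the question (the names are assumptions, so adjust them to your schema):

// Sketch only: collection and field names are assumptions based on the question.
db.collection('teachers').aggregate([
    { $match: { enabled: true } },
    // join the school document onto each teacher
    { $lookup: { from: 'schools', localField: 'schoolId', foreignField: '_id', as: 'school' } },
    { $unwind: { path: '$school', preserveNullAndEmptyArrays: true } },
    // join the district via the school's distId
    { $lookup: { from: 'districts', localField: 'school.distId', foreignField: '_id', as: 'district' } },
    { $unwind: { path: '$district', preserveNullAndEmptyArrays: true } },
    // join the county via the district's countyId
    { $lookup: { from: 'counties', localField: 'district.countyId', foreignField: '_id', as: 'county' } },
    { $unwind: { path: '$county', preserveNullAndEmptyArrays: true } }
]).toArray(function(err, teachers) {
    if (err) return next(err);
    // each teacher now carries school, district and county sub-documents
});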

Node.js Readable stream slows over time, CPU usage falls

I am trying to launch a cluster that will stream files (newline-delimited JSON) from Google Cloud Storage and transform each row after fetching data from MongoDB. After transforming a row, I want to store it in Google's BigQuery, 10,000 rows at a time. All of this is working fine, but the issue is that the rate at which the streamed files are processed decreases significantly over time.
I have set up the Node application on one server and MongoDB on another, both 8-core machines with 30 GB of RAM. When the script is executed, the CPU usage of the application server and the MongoDB server is initially around 70%-75%. After 30 minutes the CPU usage falls to 10% and then finally to 1%. The application generates no exceptions. In the application log I can see that it finished processing a few files and took up new files for processing. One execution can be observed starting a little after 3:00 PM and running almost up to 5:20 PM.
var cluster = require('cluster'),
    os = require('os'),
    numCPUs = os.cpus().length,
    async = require('async'),
    fs = require('fs'),
    google = require('googleapis'),
    bigqueryV2 = google.bigquery('v2'),
    gcs = require('@google-cloud/storage')({
        projectId: 'someproject',
        keyFilename: __dirname + '/auth.json'
    }),
    dataset = bigquery.dataset('somedataset'),
    bucket = gcs.bucket('somebucket.appspot.com'),
    JSONStream = require('JSONStream'),
    Transform = require('stream').Transform,
    MongoClient = require('mongodb').MongoClient,
    mongoUrl = 'mongodb://localhost:27017/bigquery',
    noop = function() {}, // drain handler used in startStream below
    mDb,
    groupA,
    groupB;

var rows = [],
    rowsLen = 0;

function transformer() {
    var t = new Transform({ objectMode: true });
    t._transform = function(row, encoding, cb) {
        // Get some information from mongodb and attach it to the row
        if (row) {
            groupA.findOne({
                'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
            }, {
                fields: { 'properties.OA_SA': 1, _id: 0 }
            }, function(err, a) {
                if (err) return cb();
                groupB.findOne({
                    'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
                }, {
                    fields: { 'properties.WZ11CD': 1, _id: 0 }
                }, function(err, b) {
                    if (err) return cb();
                    row.groupA = a ? a.properties.OA_SA : null;
                    row.groupB = b ? b.properties.WZ11CD : null;
                    // cache processed rows in memory
                    rows[rowsLen++] = { json: row };
                    if (rowsLen >= 10000) {
                        // batch insert rows in bigquery table
                        // and free memory
                        log('inserting 10000');
                        insertRowsAsStream(rows.splice(0, 10000));
                        rowsLen = rows.length;
                    }
                    cb();
                });
            });
        } else {
            cb();
        }
    };
    return t;
}

var log = function(str) {
    console.log(str);
};

function insertRowsAsStream(rows, callback) {
    bigqueryV2.tabledata.insertAll({
        "projectId": 'someproject',
        "datasetId": 'somedataset',
        "tableId": 'sometable',
        "resource": {
            "kind": "bigquery#tableDataInsertAllRequest",
            "rows": rows
        }
    }, function(err, res) {
        if (res && res.insertErrors && res.insertErrors.length) {
            console.log(res.insertErrors[0].errors);
            err = err || new Error(JSON.stringify(res.insertErrors));
        }
    });
}

function startStream(fileName, cb) {
    // stream a file from Google cloud storage
    var file = bucket.file(fileName),
        called = false;

    log(`Processing file ${fileName}`);
    file.createReadStream()
        .on('data', noop)
        .on('end', function() {
            if (!called) {
                called = true;
                cb();
            }
        })
        .pipe(JSONStream.parse())
        .pipe(transformer())
        .on('finish', function() {
            log('transformation ended');
            if (!called) {
                called = true;
                cb();
            }
        });
}

function processFiles(files, cpuIdentifier) {
    if (files.length == 0) return;
    var fn = [];
    for (var i = 0; i < files.length; i++) {
        fn.push(function(cb) {
            startStream(files.pop(), cb);
        });
    }
    // process 3 files in parallel
    async.parallelLimit(fn, 3, function() {
        log(`child process ${cpuIdentifier} completed the task`);
        fs.appendFile(__dirname + '/complete_count.txt', '1');
    });
}

if (cluster.isMaster) {
    for (var ii = 0; ii < numCPUs; ii++) {
        cluster.fork();
    }
} else {
    MongoClient.connect(mongoUrl, function(err, db) {
        if (err) throw (err);
        mDb = db;
        groupA = mDb.collection('groupageo');
        groupB = mDb.collection('groupbgeo');
        processFiles(files, process.pid);
        // `files` is an array of file names
        // each file is in newline json delimited format
        // ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
    });
}
Okay, I have found the culprit! The Google APIs Node.js client library makes use of a module called "stream-events", which implements Streams 0.8. Streams 0.8 does not control the rate at which it emits the 'data' event based on the consumer's ability to consume data; that rate-controlling feature was introduced in Streams 1.0. So this essentially meant that the readable stream was throwing data at MongoDB faster than it could process it.
Solution:
I used the 'request' module instead of Google's client library. I supplied a signed URL to the request module, which in turn fetched the results as a stream that I could pipe into my transformer.
Take away:
Always check the modules you use for the stream versions they rely on.
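A rough sketch of that workaround, assuming the same bucket, JSONStream, transformer and log from the question plus the request package (the signed-URL expiry is an assumption):

// Sketch only: shows the signed-URL-plus-request approach described above.
var request = require('request');

function startStream(fileName, cb) {
    var file = bucket.file(fileName);

    // Ask GCS for a temporary signed read URL instead of using file.createReadStream().
    file.getSignedUrl({
        action: 'read',
        expires: Date.now() + 60 * 60 * 1000 // assumed: valid for one hour
    }, function(err, url) {
        if (err) return cb(err);

        // request() returns a Streams2/3 readable, so backpressure is respected.
        request(url)
            .pipe(JSONStream.parse())
            .pipe(transformer())
            .on('finish', function() {
                log('transformation ended');
                cb();
            });
    });
}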

Is it possible to build a dynamic task list in nodejs Async (waterfall, series, etc...)

I am pulling information from some collections in Mongo that contain node and edge data. First I must get the node so I can grab its edges. Once I have a list of edges, I then go back out and grab more nodes (and so on, based on a depth value). The following code is a loose example of how I am attempting to use async.waterfall and the task list.
Initially I have only a single task, but once I make my first query I add to the task array. Unfortunately this does not seem to register with async, and it does not continue to process the tasks I am adding.
Is there a better way to do this?
var async = require('async')
var mongoose = require('mongoose')
var _ = require('underscore')

var Client = this.Mongo.connect(/*path to mongo*/)

var Node = mongoose.Schema({
    id : String,
    graph_id : String
})

var Edge = mongoose.Schema({
    id : String,
    source_id : String,
    destination_id : String
})

var Nodes = Client.model('myNode', Node)
var Edges = Client.model('myEdge', Edge)

var funcs = []
var round = 1
var depth = 2

var query = {
    node : {
        id : '12345'
    },
    edge : {
        id : '12345'
    }
}

var addTask = function(Nodes, Edges, query, round, depth) {
    return function(callback) {
        queryData(Nodes, Edges, query, function(err, node_list) {
            if(depth > round) {
                round++
                function_array.push(addTask(Nodes, Edges, query, round, depth))
            }
        })
    }
}

var queryData = function(Nodes, Edges, query, cb) {
    async.waterfall([
        function(callback) {
            Nodes.find(query.node, function(err, nodes) {
                var node_keys = _.map(nodes, function(node) {
                    return node.id
                })
                callback(null, nodes, node_keys)
            })
        },
        function(nodes, node_keys, callback) {
            query.edge.$or = [ {'source_id' : {$in:node_keys}}, {'destination_id' : {$in:node_keys}} ]
            Edges.find(query.edge, function(err, edges) {
                var edge_keys = _.map(edges, function(edge) {
                    if(edge['_doc']['source_id'] != query.node.id) {
                        return edge['_doc']['source_id']
                    } else {
                        return edge['_doc']['destination_id']
                    }
                })
                callback(null, nodes, edges, node_keys, edge_keys)
            })
        }
    ], function(err, nodes, edges, node_keys, edge_keys) {
        // update the results object then...
        cb(null, _.uniq(edge_keys))
    })
}

var function_array = []
function_array.push(addTask(Nodes, Edges, query, round, depth))

async.waterfall(function_array, function(err) {
    Client.disconnect()
    //this should have run more than just the initial task but does not
})
--------------------- UPDATE ---------------------------
So after playing around with trying to get async.waterfall or async.series to do this by adding trailing functions, I decided to switch to async.whilst and am now happy with the solution.
function GraphObject() {
    this.function_array = []
}

GraphObject.prototype.doStuff = function() {
    this.function_array.push(this.buildFunction(100))
    this.runTasks(function(err) {
        console.log('done with all tasks')
    })
}

GraphObject.prototype.buildFunction = function(times) {
    return function(cb) {
        if(times != 0) {
            this.function_array.push(this.buildFunction(times - 1))
        }
        cb(null)
    }
}

GraphObject.prototype.runTasks = function(cb) {
    var tasks_run = 0
    async.whilst(
        function() {
            return this.function_array.length > 0
        }.bind(this),
        function(callback) {
            var func = this.function_array.shift()
            func.call(this, function(err) {
                tasks_run++
                callback(err)
            })
        }.bind(this),
        function(err) {
            console.log('runTasks ran ' + tasks_run + ' tasks')
            if(err) {
                return cb(500)
            }
            cb(null)
        }.bind(this)
    )
}
A task in your function_array can only add a new task to the array provided it is NOT the last task in the array.
In your case, your function_array contained only one task. That task cannot add additional tasks, since it is the last one.
The solution is to have two tasks in the array: a startTask to bootstrap the process, and a finalTask that is more of a dummy task. In that case,
function_array = [startTask, finalTask];
Then startTask would add taskA, taskA would add taskB, taskB would add taskC, and eventually
function_array = [startTask, taskA, taskB, taskC, finalTask];
The sample code below illustrates the concept.
var async = require('async');
var max = 6;

var nodeTask = function(taskId, value, callback) {
    var r = Math.floor(Math.random() * 20) + 1;
    console.log("From Node Task %d: %d", taskId, r);

    // add an edge task
    if (taskId < max) {
        function_array.splice(function_array.length - 1, 0, edgeTask);
    }
    callback(null, taskId + 1, value + r);
};

var edgeTask = function(taskId, value, callback) {
    var r = Math.floor(Math.random() * 20) + 1;
    console.log("From Edge Task %d: %d", taskId, r);

    // add a node task
    if (taskId < max) {
        function_array.splice(function_array.length - 1, 0, nodeTask);
    }
    callback(null, taskId + 1, value + r);
};

var startTask = function(callback) {
    function_array.splice(function_array.length - 1, 0, nodeTask);
    callback(null, 1, 0);
};

var finalTask = function(taskId, value, callback) {
    callback(null, value);
};

var function_array = [startTask, finalTask];

async.waterfall(function_array, function(err, result) {
    console.log("Sum is ", result);
});
