I wrote a script in Node that iterates over a large MongoDB collection, returning a certain number of documents at a time.
The collection has this simple format:
{
name: 'One',
data: '...'
},
{
name: 'Two',
data: '...'
},
...
I'm doing this job with the Q library, using a sequence of promises that get run one after the other:
'use strict';
var Q = require('q');
var monk = require('monk');
var CHUNK_SIZE = 100;
var LIMIT = 1000;
var collection = monk('localhost/dictionary').get('entries');
/**
 * Builds one step of the paging chain.
 *
 * @param {number} j - Number of documents the query issued by this step skips.
 * @returns {function} Handler that takes the previous step's documents
 *   (`undefined` on the very first step), prints them and returns the query
 *   for the next chunk. Once a step receives an empty result it returns that
 *   empty result unchanged.
 */
var promiseFactory = function (j) {
  return function (result) {
    if (undefined !== result) { // result is undefined only on the first step
      if (!result.length) {
        // No more documents. Pass the empty result on instead of `undefined`:
        // every remaining step of a pre-built chain then also ends up here and
        // stops, rather than mistaking `undefined` for "first step" and
        // issuing another query.
        return result;
      }
      for (var k = 0, max = result.length; k < max; k++) {
        console.log(result[k].name); // print name
        // ... do something with the document here...
      }
    }
    // returns CHUNK_SIZE documents, starting from the j-th document
    return collection.find({}, { limit: CHUNK_SIZE, skip: j, sort: { name: 1 }});
  };
};
// Pre-build one step per chunk offset: CHUNK_SIZE, 2 * CHUNK_SIZE, ... up to LIMIT.
var funcs = [];
for (var i = CHUNK_SIZE; i <= LIMIT; i += CHUNK_SIZE) {
  funcs.push(promiseFactory(i));
}
// Start with the step for offset 0, then append the remaining steps so that
// each one runs only after the previous one has settled.
var loop = Q.fcall(promiseFactory(0));
funcs.forEach(function (f) {
  loop = loop.then(f);
});
// Nothing else consumes the chain, so terminate it here; otherwise a failed
// query or an exception thrown inside a step would be swallowed silently.
loop.done();
The script works well and does achieve what it was designed to do.
However, I would like to improve it:
I'm hardcoding the number of documents in the collection (LIMIT). I would like to get rid of this variable and let the script detect when to stop.
I have a feeling that this approach may not be the most memory-efficient one. In my code, funcs.forEach() chains a lot of copies of the same function in one shot (to be exact LIMIT/CHUNK_SIZE copies). Since I'm working on a very large collection, I was wondering if there's a way to chain a new function only if there are still documents left, while running through the collection.
I think I found the solution to both problems. It is just a simple addition in promiseFactory() which I have highlighted below. Adding it here in the hope it is useful to someone:
/**
 * Builds one step of the paging chain; each step schedules its own successor,
 * so the chain grows only while documents keep coming back.
 *
 * @param {number} j - Number of documents the query issued by this step skips.
 * @returns {function} Handler that takes the previous chunk (`undefined` on
 *   the first call), prints it and returns the query for the next chunk with
 *   the following step already attached. Returns `undefined` once a chunk
 *   comes back empty, which ends the chain.
 */
var promiseFactory = function (j) {
  return function (result) {
    if (undefined !== result) { // result is undefined only on the first call
      if (result.length) {
        for (var k = 0, max = result.length; k < max; k++) {
          console.log(result[k].name); // print name
        }
      } else { // no more entries, end of the iteration
        return; // implicitly returns undefined
      }
    }
    ///////////////// CHANGE HERE ////////////////////////
    // Uses the `collection` handle and the `name` field of the script above so
    // that this factory can replace the original one as-is.
    return collection.find({}, { limit: CHUNK_SIZE, skip: j, sort: { name: 1 }}).then(promiseFactory(j + CHUNK_SIZE));
    ///////////////////// END ////////////////////////////
  };
};
Related
I am updating database fields using setTimeout(). When there are multiple updates, the last primary key is used for all of them. How do I run the setTimeout() function sequentially? Below is the portion of code that does this.
// For each device id in req.body.devicelist: look up the matching gateway
// config and call syncConfig for it, waiting for each call before the next.
for( var i = 0; i < req.body.devicelist.length; i++) { //running for loop for multiple elements
var data = JSONPARSE.toObject(req.body);
// mac_id is assigned here without a declaration in this snippet.
mac_id = req.body.devicelist[i];
data.mac_id = mac_id;
var gateway_config;
// Linear search for the entry whose latest_config.mac_id equals mac_id; every
// non-matching entry resets gateway_config to undefined.
for (let j = 0; j < gateways_config.length; j++) { //code for fetching specific element. IGNORE
if(gateways_config[j].latest_config.mac_id == mac_id){
gateway_config = gateways_config[j]
break;
}
gateway_config = undefined
}
// NOTE(review): syncConfig is given req.body, not the per-iteration `data`.
// If JSONPARSE.toObject returns the same object rather than a copy, every
// iteration overwrites mac_id on one shared object — confirm.
await syncConfig(req.body,gateway_config, req.decoded.id);
..........
..........
}
// Starts a 10-second timer that calls commandTimeout for the gateway described
// by `body`, and stores the timer handle in config_timeout_array under the
// gateway's mac_id.
syncConfig(body,gateway,user_id){
var jsonObj = body; // alias, not a copy: jsonObj and body are the same object
...
...
...
// NOTE(review): the callback reads jsonObj.org_id and jsonObj.mac_id only when
// the timer fires. If the caller changes the object before then, the callback
// sees the changed values.
config_timeout_array[jsonObj.mac_id] = setTimeout(() => { //Causing problem
commandTimeout(jsonObj.org_id,jsonObj.mac_id)
}, 10000);
...
...
}
// Timer handler: logs mac_id, then sets sync_sent to false on the gateway
// matching org_id and mac_id and returns the resulting promise chain.
commandTimeout:(org_id, mac_id) =>{
console.log(mac_id); //prints same mac_id (the last in the array)
// NOTE(review): {"new": true} presumably requests the updated document from
// findOneAndUpdate — confirm against the model library in use.
return gateway_model.findOneAndUpdate({ org_id: org_id, mac_id: mac_id }, { 'sync_sent': false }, {"new": true})
.then((updated_gateway) => {
...
...
...
}
}
config_timeout_array[jsonObj.mac_id] = setTimeout(() => { //Causing problem
commandTimeout(jsonObj.org_id,jsonObj.mac_id)
}, 10000);
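Instead of running the above logic inline, move it into a separate function and call that. It most likely works because the function receives org_id and mac_id as arguments: each timer callback then closes over its own copies of those values, taken at call time, instead of reading jsonObj later, after the loop has already overwritten it.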
seqTimeout(jsonObj.org_id,jsonObj.mac_id); //function call
seqTimeout(org_id,mac_id){
config_timeout_array[mac_id] = setTimeout(() => {
GatewayController.commandTimeout(org_id, mac_id);
}, COMMAND_TIMEOUT);
}
Using some inspiration from this thread and its reply, I tried to get my loop working, which writes into Firestore in batches. But somehow I can only update 1 document, even though I can see that I iterate through different values from my array. I load the data into an array and work from there.
// Reads mydata.json as a stream of array items, collects the items in `arr`,
// and on end-of-stream writes them to the 'Testing' collection in batches.
const db = admin.firestore();
const jsonStream = StreamArray.withParser();
let arr = []
// Collect the value of every streamed item.
jsonStream.on('data', ({ key, value }) => {
arr.push(value);
});
jsonStream.on('end', () => {
var counter = 0;        // writes added to the current batch
var commitCounter = 0;  // index of the current batch in `batches`
var batches = [];
arr.forEach((a, ind) => {
// NOTE(review): this assignment runs for every item, so the batch stored at
// batches[commitCounter] is replaced by a new, empty one on each iteration.
batches[commitCounter] = db.batch();
if (counter <= 498) {
var thisRef = db.collection('Testing').doc(a.id);
console.log("id")
console.log(a.id);
batches[commitCounter].set(thisRef, { ...a });
counter = counter + 1;
} else {
// Roll over to the next batch.
// NOTE(review): no set() call is made for `a` in this branch, so the item
// that triggers the rollover is not added to any batch.
counter = 0;
commitCounter = commitCounter + 1;
batches[commitCounter] = db.batch();
}
})
// Commit every batch; the commits are started without waiting for each other
// and no rejection handler is attached.
for (var i = 0; i < batches.length; i++) {
if(i==0)
{
console.log(batches[0])
}
batches[i].commit().then(function () {
console.count('wrote batch');
});
}
});
const filename = path.join(__dirname, 'mydata.json');
fs.createReadStream(filename).pipe(jsonStream.input);
Following line gets executed on each iteration, which essentially "resets" your batch on each round:
batches[commitCounter] = db.batch();
So at the end each of your batches will only contain one document write.
So I ran into a problem. I don't know how to pass a single string from a child function to its parent function and then send that string as a response to the client side.
This whole thing gets five recent matches from API and then checks for a win or a loss depending on the player name.
Question 1: as i said before i don't know how to pass string from a child function to the parental function and then send it as a response to client side.
Question 2: the output should be WWWLW, and I think it should always come out in that order. But every time it comes out in a different order, like LWWWW, WLWWW and so on... It has the right letters but in a different order, so I am missing something here.
code:
var request = require('request');
app.get('/history',getmatches, getwins);
/**
 * Route handler: fetches the account's recent match list and stores the game
 * ids of up to five matches on `req.somevariable` (keys 0..4) for the next
 * handler.
 *
 * @param {object} req - Incoming request; receives `somevariable`.
 * @param {object} res - Outgoing response (not used here).
 * @param {function} next - Called with no argument on success, or with an
 *   Error when the API request fails or does not answer with status 200.
 */
function getmatches(req, res, next){
  var match = {};
  request({
    url: "https://eun1.api.riotgames.com/lol/match/v3/matchlists/by-account/"+ID+"/recent?api_key=" + key,
    json: true
  }, function (error, response) { // `response`, so the outer `res` is not shadowed
    // Always hand control on: without these calls a failed API request would
    // leave the client's request hanging.
    if (error) {
      return next(error);
    }
    if (!response || response.statusCode !== 200) {
      return next(new Error('match list request failed with status ' + (response && response.statusCode)));
    }
    var matches = (response.body && response.body.matches) || [];
    // The account may have fewer than five recent matches.
    var count = Math.min(5, matches.length);
    for (var i = 0; i < count; i++) { //getting ID's of five last matches
      match[i] = matches[i].gameId;
    }
    req.somevariable = match;
    next();
  });
};
// Requests the details of the five matches in req.somevariable and builds a
// string with one 'W' or 'L' per match for the player named `nickname`.
function getwins(req, res, callback){
var match = req.somevariable;
var streak = '';
var pending = 0; // number of responses handled so far
// NOTE(review): `i` is not declared in this function.
for( i = 0; i < 5; i++){ // passing ID's to another api link to get single match data
request({
url: "https://eun1.api.riotgames.com/lol/match/v3/matches/"+match[i]+"?api_key=" + key,
json: true
// NOTE(review): these parameters are named req/res and shadow the outer
// req/res inside the callback.
}, function(req,res, body){
// A letter is appended whenever a response callback runs, so the order of the
// letters follows the order in which the callbacks run.
for(var j = 0; j < 10; j++){ //looping through 10 players in a match to find specific one
if(body.participantIdentities[j].player.summonerName == nickname){
if( body.participants[j].stats.win == true){
streak += 'W';
}else{
streak += 'L';
}
}
}
// The fifth callback to run sees pending == 4 and hands the string on.
if(pending == 4){
console.log(streak); // need this to pass to parent function
return callback(null, streak); // is this something i need ?
}
pending++
});
}
// res streak string to client.js
};
Here is a solution that processes all the results once every request is done. The `results` variable holds all of the results; use any appropriate key instead of the URL.
/**
 * Route handler: requests the five matches listed in `req.somevariable`,
 * works out a 'W' or 'L' for the player named `nickname` in each of them and
 * sends the letters to the client in match order once every response is in.
 *
 * @param {object} req - Incoming request; `req.somevariable[i]` is a match id.
 * @param {object} res - Outgoing response; receives the streak via `res.send`.
 * @param {function} callback - Called once with an Error if a match request
 *   fails or does not answer with status 200.
 */
function getwins(req, res, callback){
  var match = req.somevariable;
  var total = 5;
  var results = [];     // results[i] holds the letter for match[i]
  var pending = total;  // responses still outstanding
  var failed = false;   // set once an error has been reported

  // One function call per match, so each response callback keeps its own
  // `index` and writes into its own slot no matter when it arrives.
  function fetchResult(index) {
    var url = "https://eun1.api.riotgames.com/lol/match/v3/matches/"+match[index]+"?api_key=" + key;
    results[index] = '';
    request({
      url: url,
      json: true
    }, function(error, response, body){
      if (failed) {
        return;
      }
      if (error || !response || response.statusCode !== 200) {
        failed = true;
        return callback(error || new Error('match request failed with status ' + (response && response.statusCode)));
      }
      var identities = body.participantIdentities;
      for (var j = 0; j < identities.length; j++) { //looping through the players in a match to find specific one
        if (identities[j].player.summonerName == nickname) {
          results[index] += body.participants[j].stats.win == true ? 'W' : 'L';
        }
      }
      pending--;
      if (pending === 0) {
        // here all requests are done - do with all result what you need
        var streak = results.join('');
        console.log(streak);
        res.send(streak);
      }
    });
  }

  for (var i = 0; i < total; i++) { // passing ID's to another api link to get single match data
    fetchResult(i);
  }
};
In NodeJS, I tried to create 2 objects of the same class. However, these 2 objects always end up the same despite having different values. Here is the class.
// Constructor for a reading.
// NOTE(review): readingArr below is a local variable of the constructor; it is
// not stored on `this`, so instances do not carry it.
function reading(){
var readingArr = [];
};
// Queries documents of the given type (newest timestamp first, `limit`
// documents, skipping `counter`) and collects measurement[mIndex].value of
// each one, formatted with two decimals.
reading.prototype.dbValue = function(counter, limit, type, mIndex) {
db.data.find({ 'type': type }).limit(limit).sort({timestamp:-1}).skip(counter, function(err, docs){
// NOTE(review): readingArr is assigned without var/let/const here, so it is
// not the constructor's local and is shared by every call of dbValue.
readingArr = [];
// NOTE(review): this compares docs with the string 'undefined'.
if( docs != 'undefined' ){
for(var i=0; i<limit; i++){
readingArr.push(docs[i].measurement[mIndex].value.toFixed(2)); // 2 decimal places
}
}
});
// This part runs as soon as the query has been issued; if the callback above
// is invoked later, the value returned here does not yet contain its results.
if(typeof readingArr == 'undefined'){
readingArr = [];
}
return readingArr;
};
Here is the object creation.
var spo2 = new reading();
var spo2Arr = spo2.dbValue(0, 5, 'Oximeter', 1);
var temp1 = new reading();
var temp1Arr = temp1.dbValue(0, 5, 'Temperature', 0);
Both spo2Arr and temp1Arr return the same value despite having different value in the database. Example
spo2Arr: 98.00
temp1Arr: 98.00
spo2Arr: 37.91
temp1Arr 37.91
May I know how to create two unique object in NodeJS?
You're performing an asynchronous function call which is not going to complete until some time after dbValue() has finished executing.
Try this:
/**
 * Fetches readings of one type, newest first, and passes them to `cb`.
 *
 * @param {number} counter - Number of documents to skip.
 * @param {number} limit - Maximum number of documents to read.
 * @param {string} type - Value of the `type` field to query for.
 * @param {number} mIndex - Index into each document's `measurement` array.
 * @param {function} cb - Called as cb(err) on failure, or as
 *   cb(null, readingArr) where readingArr holds the values formatted with two
 *   decimals, one entry per document returned.
 */
reading.prototype.dbValue = function(counter, limit, type, mIndex, cb) {
  db.data.find({ 'type': type }).limit(limit).sort({timestamp:-1}).skip(counter, function(err, docs){
    if (err)
      return cb(err);
    // The query can return fewer than `limit` documents; reading past the end
    // of `docs` would throw.
    var count = docs ? Math.min(limit, docs.length) : 0;
    var readingArr = [];
    for (var i = 0; i < count; i++)
      readingArr.push(docs[i].measurement[mIndex].value.toFixed(2));
    cb(null, readingArr);
  });
};
Then you might use it like:
var spo2 = new reading();
spo2.dbValue(0, 5, 'Oximeter', 1, function(err, spo2Arr) {
// check for `err`, if it's falsey, use `spo2Arr`
});
var temp1 = new reading();
temp1.dbValue(0, 5, 'Temperature', 0, function(err, temp1Arr) {
// check for `err`, if it's falsey, use `temp1Arr`
});
If the temperature readings depend on the oximeter readings, you'll have to move the temperature reading code inside the oximeter reading callback, or you can use a module like async to help structure your control flow.
I am pulling information from some collections in Mongo that contain node and edge data. First I must get the node so I can grab its edges. Once I have a list of edges, I then go back out and grab more nodes (and so on, based on a depth value). The following code is a loose example of how I am attempting to use async.waterfall and the task list.
Initially i have only a single task but once i make my first query i add to the task array. Unfortunately this does not seem to register with async and it does not continue to process the tasks i am adding.
Is there a better way to do this?
var async = require('async')
var mongoose = require('mongoose')
var _ = require('underscore')
var Client = this.Mongo.connect(/*path to mongo*/)
// Schemas for the two collections: graph nodes, and edges linking two node ids.
var Node = mongoose.Schema({
  id : String,
  graph_id : String
})
var Edge = mongoose.Schema({
  id : String,
  source_id : String,
  destination_id : String
})
var Nodes = Client.model('myNode', Node)
var Edges = Client.model('myEdge', Edge)
var funcs = []
// `round` is the current traversal round and `depth` the round at which to stop.
var round = 1
var depth = 2
// Filters for the node and edge lookups.
var query = {
  node : {
    id : '12345'
  },
  edge : {
    id : '12345'
  }
}
/**
 * Builds a task that runs one round of queryData and, while `round` is below
 * `depth`, appends a task for the next round to function_array.
 *
 * @returns {function} Task taking a node-style `callback`, which receives the
 *   error from queryData or `null` when the round completed.
 */
var addTask = function(Nodes, Edges, query, round, depth) {
  return function(callback) {
    queryData(Nodes, Edges, query, function(err, node_list) {
      if (err) {
        return callback(err)
      }
      if(depth > round) {
        round++
        function_array.push(addTask(Nodes, Edges, query, round, depth))
      }
      // Report completion; without this call the task runner never learns
      // that the task has finished.
      callback(null)
    })
  }
}
/**
 * Looks up the nodes matching `query.node`, then the edges that touch any of
 * those nodes, and reports the ids found at the far end of those edges.
 *
 * @param {object} Nodes - Model exposing find(filter, cb) for nodes.
 * @param {object} Edges - Model exposing find(filter, cb) for edges.
 * @param {object} query - `{ node: filter, edge: filter }`; not modified.
 * @param {function} cb - Called as cb(err) on failure, or as
 *   cb(null, ids) with the de-duplicated neighbouring ids.
 */
var queryData = function(Nodes, Edges, query, cb) {
  async.waterfall([
    function(callback) {
      Nodes.find(query.node, function(err, nodes) {
        if (err) {
          return callback(err)
        }
        var node_keys = _.map(nodes, function(node) {
          return node.id
        })
        callback(null, nodes, node_keys)
      })
    },
    function(nodes, node_keys, callback) {
      // Build the edge filter on a copy so the caller's query stays untouched.
      var edge_query = Object.assign({}, query.edge, {
        $or : [ {'source_id' : {$in:node_keys}}, {'destination_id' : {$in:node_keys}} ]
      })
      Edges.find(edge_query, function(err, edges) {
        if (err) {
          return callback(err)
        }
        // For each edge keep the end that is not the queried node.
        var edge_keys = _.map(edges, function(edge) {
          if(edge['_doc']['source_id'] != query.node.id) {
            return edge['_doc']['source_id']
          }
          return edge['_doc']['destination_id']
        })
        // Must run after the map, not inside its iterator, or the waterfall
        // never advances.
        callback(null, nodes, edges, node_keys, edge_keys)
      })
    }
  ], function(err, nodes, edges, node_keys, edge_keys) {
    if (err) {
      return cb(err)
    }
    // update the results object then...
    cb(null, _.uniq(edge_keys))
  })
}
// Seed the task list with the task for the first round; tasks for later
// rounds are pushed onto this same array by addTask while it runs.
var function_array = [addTask(Nodes, Edges, query, round, depth)]
async.waterfall(function_array, function(err) {
  //this should have run more than just the initial task but does not
  Client.disconnect()
})
--------------------- UPDATE ---------------------------
So after playing around with trying to get Async waterfall or series to do this by adding trailing functions I decided to switch to using async.whilst and am now happy with the solution.
/**
 * Holds a queue of tasks (`function_array`) that may grow while it is being
 * drained by runTasks.
 */
function GraphObject() {
  this.function_array = []
}

// Queues a task that re-queues itself 100 more times, then drains the queue.
GraphObject.prototype.doStuff = function() {
  this.function_array.push(this.buildFunction(100))
  this.runTasks(function(err) {
    console.log('done with all tasks')
  })
}

/**
 * Builds a task that, while `times` is not 0, queues a follow-up task with
 * `times - 1`. The task expects to be invoked with `this` set to the
 * GraphObject, which is how runTasks calls it.
 *
 * @param {number} times - Number of follow-up tasks still to queue.
 * @returns {function} Task taking a node-style callback.
 */
GraphObject.prototype.buildFunction = function(times) {
  return function(cb) {
    if(times != 0) {
      this.function_array.push(this.buildFunction(times - 1))
    }
    cb(null)
  }
}

/**
 * Runs queued tasks one at a time until the queue is empty, including tasks
 * that were queued while running.
 *
 * @param {function} cb - Called exactly once: with 500 if a task reported an
 *   error, otherwise with null.
 */
GraphObject.prototype.runTasks = function(cb) {
  var tasks_run = 0
  async.whilst(
    function(){
      return this.function_array.length > 0
    }.bind(this),
    function(callback) {
      var func = this.function_array.shift()
      func.call(this, function(err) {
        tasks_run++
        callback(err)
      })
    }.bind(this),
    function(err) {
      console.log('runTasks ran '+tasks_run+' tasks')
      if(err) {
        // return, so that cb is not invoked a second time below
        return cb(500)
      }
      cb(null)
    }.bind(this)
  )
}
A task in your function_array can only add a new task to the array provided it is NOT the last task in the array.
In your case, your function_array contained only 1 task. That task itself cannot add additional tasks since it's the last task.
The solution is to have 2 tasks in the array. A startTask to bootstrap the process, and a finalTask that is more of a dummy task. In that case,
function_array = [startTask, finalTask];
Then startTask would add taskA, taskA would add taskB, taskB would add taskC, and eventually
function_array = [startTask, taskA, taskB, taskC, finalTask];
The sample code below illustrates the concept.
var async = require('async');
var max = 6;
/**
 * Node step of the demo: draws a random integer from 1 to 20, logs it, and
 * while taskId is below `max` inserts an edge task just before the last entry
 * of function_array. Passes on the next task id and the running sum.
 */
var nodeTask = function(taskId, value, callback){
  var drawn = 1 + Math.floor(20 * Math.random());
  console.log("From Node Task %d: %d", taskId, drawn);
  if (max > taskId) {
    // keep the final task last: insert in front of it
    var beforeLast = function_array.length - 1;
    function_array.splice(beforeLast, 0, edgeTask);
  }
  callback(null, taskId + 1, value + drawn);
};
/**
 * Edge step of the demo: draws a random integer from 1 to 20, logs it, and
 * while taskId is below `max` inserts a node task just before the last entry
 * of function_array. Passes on the next task id and the running sum.
 */
var edgeTask = function(taskId, value, callback){
  var drawn = 1 + Math.floor(20 * Math.random());
  console.log("From Edge Task %d: %d", taskId, drawn);
  if (max > taskId) {
    // keep the final task last: insert in front of it
    var beforeLast = function_array.length - 1;
    function_array.splice(beforeLast, 0, nodeTask);
  }
  callback(null, taskId + 1, value + drawn);
};
// Bootstrap step: inserts the first node task in front of the last entry of
// function_array and starts the chain with task id 1 and a sum of 0.
var startTask = function(callback) {
  var beforeLast = function_array.length - 1;
  function_array.splice(beforeLast, 0, nodeTask);
  callback(null, 1, 0);
};
// Closing step: drops the task id and hands only the accumulated sum on.
var finalTask = function(taskId, value, callback) {
  callback(null, value);
};
// The list starts with just the bootstrap and closing steps; the other steps
// are spliced in between them while the waterfall runs.
var function_array = [startTask, finalTask];
async.waterfall(function_array, function (err, result) {
  console.log("Sum is ", result);
});