I am on scraping actions on nodejs, I am using request to connect to site, cheerio to access to the data and mongodb to store the data extracted. Also I am using async.js to avoid infinite recursion.
I have got a memory problem because my process takes memory and do not free it. I think that the problem is on mongodb because if I don't use mongodb the memory remains stable.
This is my summarized code:
// Use function scrape_urls to process the urls
var q = self.asyn.queue(scrape_urls, 3);
//I push a bunch of urls ...
for (var j = 0; j < self.urls_data.length; j++) {
q.push(self.urls_data[j]);
}
q.drain = function () {
console.log("END");
};
function scrape_urls(data_url, next_action) {
request({
method: 'GET',
url: data_url.url
}, function (err, response, body) {
var $ = cheerio.load(body);
data = { // ... scraped data ... };
mongo_client.connect(connection_string, function (err, db) {
if (err) { return console.dir(err); }
var collection = db.collection('foo');
collection.insert(data);
next_action();
});
});
};
As I say, if I avoid to use mongodb and only I connect to the urls using request, the memory will not grow endless, I think that connecting to mongodb is the problem.
Any ideas?
Problem solved.
I leave here a solution. I made a helper to reuse the connection and maintain only one (after all, nodejs is single-thread):
var MongoDbHelper = function (mongo_client, connection_string){
var self = this;
this.mongo_client = mongo_client;
this.connection_string = connection_string;
this.db = undefined;
self.log = function (thread, str)
{
console.log(new Date().toISOString() + ' ' + process.memoryUsage().rss + ' [' + thread + '] ' + str);
}
self.getcollection = function(collection_name, callback)
{
var collection = null;
try
{
collection = self.db.collection(collection_name);
}
catch(ex)
{
self.db = undefined;
}
// reconnecting if the connection is lost
if(self.db == undefined)
{
self.mongo_client.connect(connection_string, function(err, db) {
self.db = db;
var collection = self.db.collection(collection_name);
callback(err, self.db, collection);
});
}
else
{
callback(null, self.db, collection);
}
}
};
module.exports = MongoDbHelper
Related
I'm guessing this is related to not understanding promises and execution order, but I'm currently stumped why this Firebase Function (repackaged Google Cloud Functions) code runs recursively.
Currently the function executes once successfully (fetches data, writes database entry, writes file in storage), and then repeats every 15-30 seconds until it reaches the '402' error state. It is intended to only execute once.
Any help would be appreciated.
exports.add = functions.https.onRequest((req, res) => {
cors(req, res, () => {
if (req.query.idToken) {
// there's a query param
var idToken = req.query.idToken;
admin.auth().verifyIdToken(idToken)
.then(function(decodedToken) {
var uid = decodedToken.uid;
var userRef = database.ref('users/' + uid);
var feedCountRef = database.ref('users/' + uid).child('feeds');
var plansRef = database.ref('plans')
userRef.once('value', function(snapshot){
var feedsCount = snapshot.val().feeds;
var currentPlan = snapshot.val().membership;
var planRef = database.ref('plans/' + currentPlan);
planRef.once('value', function(snapshot) {
console.log(snapshot.val());
var allowedFeeds = snapshot.val().feeds;
if(feedsCount < allowedFeeds) {
fetchFeed(req.body.feedSource, function(feedData) {
var defaultFeedName = 'Untitled';
var defaultUpdateFrequency = 'Weekly';
var feedsdatabaseRef = database.ref('feeds/' + uid);
var newFeedDatabaseRef = feedsdatabaseRef.push();
var feedKey = newFeedDatabaseRef.key;
writeFeedStorage(feedKey, feedData, function(response) {
console.log(response);
newFeedDatabaseRef.set({
// write data
})
});
feedCountRef.transaction(function(feeds){
return (feeds || 0) + 1;
});
return;
});
} else {
console.log('over quota');
res.status(402).send({error: 'You are at the maximum number of feeds your plan allows.'});
}
});
})
}).catch(function(error) {
res.status(401);
});
} else {
res.status(401);
}
})
})
From your code snippet, a potential reason that it would be running repeatedly is that you are not returning an ok status if things worked out correctly, e.g.
res.status(200).send('ok');
According to the Firebase documentation, this is something you should be doing for HTTP Functions.
This is my first personal project in Nodejs. I'm trying to get in live soon.
I have a Nodejs server that uses sqlite3. There are only 3000 rows with word, transform and a precalculated value each in a column of the table, which is already populated.
I need to just lookup the word in the DB to be sure it is valid.
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");
db.get("SELECT * FROM tab WHERE w = ?", word, function(err, row) {
if(err) { console.log("Lookup:",word,", Error => ",err); return false; }
return true;
});
The problem is that the caller of this code has a lot of context and need the operation to wait. So, I tried this
function dbLookup(db, w) {
return function(cb) {
var rows = [];
db.exec('SELECT w FROM tab WHERE w = "'+w+'"')
.on('row', function(r) {
rows.push(r)
})
.on('result', function() {
cb(rows);
});
}
async.each([word], function(w) {
dbLookup(this.db, w);
}, function(err) {
if(err) {console.log("...ERROR..."); return false; }
else {console.log("...SUCCESS..."); return true; }
});
This doesn't solve the wait issue as the callback can fire at its own pace.
I read that promise using something like bluebird can solve my problem
but now I'm not able to get the value/result of the query out:
I've been pulling my hair for so long. Please help me either get the async working or get the result back from the promise approach.
var async = require('async');
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");
function check(word, callback) {
db.get("SELECT count(1) cnt FROM tab WHERE w = ?", word, callback)
}
async.map(words, check, function(err, results) {
if (err)
return console.log('Query error')
var all_checked = results.filter(function(r) {
return r.cnt > 0
});
...
});
Or
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");
db.all("SELECT distinct w FROM tab", function(err, rows) {
var all_checked = words.filter(function (w) {
return rows.indexOf(w) != -1;
})
...
})
Considering that my server.js looks almost like this. Just send you the relevant part. I did not receive anything from the query, I do have data in the database, and "sendNotification" is triggered by the jQuery function in the client. Everything works and since var notis = []; returns an empty value and is what is shows as response. I know I have to debug SQL and that's what I'm going to do but anyway want to be sure of this other things. So my questions are:
1) Is a right syntax for node.js, considering this async behavior? (which I still don't understand )
2) The query always should be inside of the "io.sockets.on('connection')" part?
connection = mysql.createConnection({
host: 'localhost',
user: '',
password: "",
database: 'table' //put your database name
}),
...
connection.connect(function(err) {
// connected! (unless `err` is set)
console.log(err);
});
…
var sqlquery = function(uID,vs){
var notis = [];
connection.query("SELECT * FROM notification WHERE kid = ? AND v = ? ORDER BY id DESC",[uID,vs])
.on("result", function (data){
return notis.push(data);
});
};
io.sockets.on('connection', function(socket) {
...
socket.on("sendNotification", function(data) {
var roomBName = data.room_name.replace("room-",""),
found = [];
var roomSelected = _.find(rooms, function (room) { return room.id == roomBName });
for (var person in people) {
for (var i = 0, numAttending = roomSelected.peopleAttending.length; i < numAttending; i++) {
if (people[person].name == roomSelected.peopleAttending[i]) {
found.push(person);
}
}
}
for (var i = 0, numFound = found.length; i < numFound; i++) {
**result = sqlquery(9,2);**
io.to(found[i]).emit('notification', result);
};
});
Your sqlquery() function will not accomplish anything useful. Because connection.query() is asynchronous, that means it provides the response sometime LATER after sqlquery() has already finished.
The only way in node.js to use an async result is to actually use it in the callback that provides it. You don't just stuff it into some other variable and expect the result to be there for you in other code. Instead, you use it inside that callback or you call some other function from the callback and pass it the data.
Here's one way, you could change your sqlquery() function:
var sqlquery = function(uID, vs, callback){
connection.query("SELECT * FROM notification WHERE kid = ? AND v = ? ORDER BY id DESC",[uID,vs])
.on("result", function (data){
callback(null, data);
});
// need to add error handling here if the query returns an error
// by calling callback(err)
};
Then, you could use the sqlquery function like this:
found.forEach(function(person, index) {
sqlquery(..., function(err, result) {
if (err) {
// handle an error here
} else {
io.to(person).emit('notification', result);
}
});
});
And, it looks like you probably have similar async issues in other places too like in connection.connect().
In addition to #jfriend00, this could be done with new ES6 feature Promise :
var sqlquery = function(uID, vs){
return new Promise(function(resolve, reject){
connection.query("SELECT * FROM notification WHERE kid = ? AND v = ? ORDER BY id DESC",[uID,vs])
.on("result", function (data){
resolve(data);
});
});
};
Now you can use it like :
found.forEach(function(person, index) {
sqlquery(...)
.then(function(result){
io.to(person).emit('notification', result);
});
});
I use the following code to loop insert 1000000 documents to mongodb,but i found node process takes up a lot of memory,my client are dead.
db.collection("batch_insert", function (err, collection) {
if (!err) {
var count = 0;
for (var i = 0; i < 1000000; i++) {
collection.insert({hello:'world', ok:'OKOKOK'}, {safe:true, serializeFunctions:false}, function (err, result) {
count++;
if (1000000 == count) {
db.close();
}
});
}
} else {
console.log(err);
}
});
Your for cycle blocks event loop. And it can't go to nextTick and handle query results until all queries sended to mongodb. You need to use asynchronous way to batch insert data.
Something like this:
var mongo = require('mongodb');
var Inserter = function (collection) {
this.collection = collection;
this.data = [];
this.maxThreads = 6;
this.currentThreads = 0;
this.batchSize = 5000;
this.queue = 0;
this.inserted = 0;
this.startTime = Date.now();
};
Inserter.prototype.add = function(data) {
this.data.push(data);
};
// Use force=true for last insert
Inserter.prototype.insert = function(force) {
var that = this;
if (this.data.length >= this.batchSize || force) {
if (this.currentThreads >= this.maxThreads) {
this.queue++;
return;
}
this.currentThreads++;
console.log('Threads: ' + this.currentThreads);
this.collection.insert(this.data.splice(0, this.batchSize), {safe:true}, function() {
that.inserted += that.batchSize;
var currentTime = Date.now();
var workTime = Math.round((currentTime - that.startTime) / 1000)
console.log('Speed: ' + that.inserted / workTime + ' per sec');
that.currentThreads--;
if (that.queue > 0) {
that.queue--;
that.insert();
}
});
}
};
var db = new mongo.Db('test', new mongo.Server('localhost', 27017, {}), {native_parser:false});
db.open(function(err, db) {
db.collection('test', function(err, collection) {
var inserter = new Inserter(collection);
setInterval(function() {
for (var i = 0; i < 5000; i++) {
inserter.add({test:'test'});
}
inserter.insert();
}, 0);
});
});
mongodb, just like any other database, takes some time to process requests. You're throwing a million requests at it, and since nothing in your code blocks, that means that at any time a whole bunch of them are going to be queued up somewhere (most likely in multiple places, with some of them inside the driver's code, others inside node's event loop). That takes more than a little bit of memory.
If the queuing didn't happen, you'd either block or drop some of the requests. There Ain't No Such Thing As A Free Lunch.
NEW POST:
Here is the sample of the working async code without a db.
The problem is, if i replace the vars (data1_nodb,...) with the db.collection.find();
function, all needed db vars received at the end and the for() loop ends not
correct. I hope that explains my problem a bit better. OA
var calc = new Array();
function mach1(callback){
error_buy = 0;
// some vars
for(var x_c99 = 0; x_c99 < array_temp_check0.length;x_c99++){
// some vars
calc[x_c99] = new Array();
calc[x_c99][0]= new Array();
calc[x_c99][0][0] = "dummy1";
calc[x_c99][0][1] = "dummy2";
calc[x_c99][0][2] = "dummy3";
calc[x_c99][0][3] = "dummy4";
calc[x_c99][0][4] = "dummy5";
function start_query(callback) {
data1_nodb = "data1";
data2_nodb = "data2";
data3_nodb = "data3";
data4_nodb = "data4";
calc[x_c99][0][0] = data1_nodb;
calc[x_c99][0][1] = data2_nodb;
calc[x_c99][0][2] = data3_nodb;
callback(data1_nodb,data2_nodb,etc..);
}
start_query(function() {
console.log("start_query OK!");
function start_query2(callback) {
data4_nodb = "data5";
data5_nodb = "data6";
data6_nodb = "data7";
calc[x_c99][0][3] = data4_nodb;
calc[x_c99][0][4] = data5_nodb;
callback(data5_nodb,data6_nodb,etc..);
}
start_query2(function() {
console.log("start_query2 OK!");
function start_query3(callback) {
for(...){
// do something
}
callback(vars...);
}
start_query3(function() {
console.log("start_query3 OK!");
});
});
});
}
callback(calc);
};
function mach2(callback){
mach1(function() {
console.log("mach1 OK!");
for(...){
// do something
}
});
callback(calc,error_buy);
};
mach2(function() {
console.log("mach2 OK 2!");
});
OLD POST:
i try to read data from the mongodb and send them back with a callback to the next
function, that needs the infos from the db to proceed.
Without the mongodb read functions it works perfect but now i dont know how
i can send the db vars out of the two inner functions to the first callback function.
Hope someone can help me...
Thanks
var error = 0; var var1 = "yessir";
function start_query(callback) {
var db_name = "db1";
db[db_name].find({name:var1},{data1:1, data2:1, data3:1, data4:1}, function(err, data_catch,callback) {
if( err || !data_catch ) {
console.log("Problem finding data_catch:" + err);
} else {
data_catch.forEach( function(data_catch_finder,callback) {
data1_db = data_catch_finder.data1;
data2_db = data_catch_finder.data2;
data3_db = data_catch_finder.data3;
data4_db = data_catch_finder.data4;
if(data1_db == "" || data2_db == "" || data3_db == "" || data4_db == ""){error = 1; console.log("Error: data_catch_finder");}
callback(data1_db, data2_db, data3_db, data4_db, error);
});
}
});
callback(data1, data2, data3, data4, error);
}
//########################################################################
start_query(function() {
function start_query2(callback) {
console.log("DATA1 from callback:" + data1_db);
console.log("DATA2 from callback:" + data2_db);
console.log("DATA3 from callback:" + data3_db);
console.log("DATA4 from callback:" + data4_db);
var var_no_db = "testing";
//do something else and callback
callback(var_no_db);
}
start_query2(function() {
console.log("Var from callback start_query2:" + var_no_db);
console.log("The end");
});
});
your callback signature are issuing callback as a parameter.
As far as I can understand your code, you need to keep reference of the first callback, the one you receive here: function start_query(callback).
In every callback function you made the mistake to bind the variable name callback to the parameter from the mongo driver (a simple undefined i think).
You can fix it removing every reference of callback from the signature of your inner functions.
a simple example:
function async (cb) {
// params: Error, data
cb(null, 'moo');
}
function foo(callback) {
async(function(err, data, callback){
console.log(callback); // undefined
});
console.log(callback); // Moo
}
foo('Moo');
Take a look at Eloquent Javascript to better understand the JS context switching;
edit
The only way to wait the results of an async function is recall the first callback inside the last nested callback.
function ugly_nested (callback) {
dbquery('...', function(err, data_01) {
if (!! err) return callback(err);
dbquery('...', function(err, data_02) {
if (!! err) return callback(err);
dbquery('...', function(err, data_03) {
if (!! err) return callback(err);
callback(null, data_01, data_02, data_03);
});
});
});
}
ugly_nested(function(err, data01, data02, data03) {
if (!! err) throw err;
manage_data(data01, data02, data03);
});
The FOR loop is synchronous, but, the database calls are asynchronous, so, the for loop will end before the database returns his results. If you really need that for loop you can try out one of the nice flow control libraries out there