This question has been asked before, but the answer the OP accepted did not address my particular needs:
closing mongodb connection in node.js while inserting lot of data
I have a utility script that adds a lot of records to multiple collections. Really it is just an import that uses byline to read the VERY LARGE text files and then inserts the data into a collection:
var MongoClient = require("mongodb").MongoClient;
var fs = require("fs");
var byline = require("byline");

var inStream = fs.createReadStream("data.txt", { encoding: "utf8" });
var byLineStream = byline.createStream(inStream);

MongoClient.connect("mongodb://localhost:27017/test", { native_parser: true }, function(err, db) {
    var collection = db.collection("Data");
    db.dropCollection("Data", function(err, result) {
        byLineStream.on("data", function(line) {
            var o = parseLineToObject(line);
            collection.insert(o);
        });
    });
});
The suggested answer was to push all the data into an array and then use a single write with a callback to close the database when it is done. This is not a good answer for me, as the files I am working with are very large and would consume large amounts of memory.
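Roughly, that suggestion looked like this (my own sketch of it, not the accepted answer's exact code):
var allDocs = [];

byLineStream.on("data", function(line) {
    allDocs.push(parseLineToObject(line)); // the entire file ends up in memory
});

byLineStream.on("end", function() {
    collection.insert(allDocs, { w: 1 }, function(err) {
        db.close(); // close once the single bulk insert is acknowledged
    });
});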
Another solution presented to a similar question was to use the async package to create an array of functions and then run them in parallel. Another bust, but at least it doesn't create one huge insert.
So the question: how do I close the MongoDB connection once all the inserts are complete, so that my script exits and does not hang?
I should add that I have tried the counting method, where I increment a counter for each insert I issue and decrement it in the insert callback. It doesn't work because at some point during the run the callbacks complete faster than new inserts are issued, so the counter hits 0 while lines are still being read, and the db gets closed too early.
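For what it's worth, that attempt looked roughly like this (a reconstruction; the variable names are mine):
var pending = 0;

byLineStream.on("data", function(line) {
    pending++;
    collection.insert(parseLineToObject(line), function(err) {
        if (--pending === 0) {
            // fires too early: the stream may still have more lines to deliver
            db.close();
        }
    });
});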
You should set a flag when all lines have been read:
var readAllLines = false;

byLineStream.on("end", function() {
    readAllLines = true;
});
Next, you check for that flag after inserting each record. However, you also need to keep track of how many lines have been read and how many have been inserted, so you only close the database once every line has been inserted (even if the inserts complete out of order).
Putting everything together:
db.dropCollection("Data", function(err, result) {
var lineCount = 0;
var readAllLines = false;
byLineStream.on("end", function() {
readAllLines = true;
});
byLineStream.on("data", function(line) {
lineCount++;
var o = parseLineToObject(line);
collection.insert(o, { w : 1 }, function() {
if (--lineCount === 0 && readAllLines) {
// we've read and inserted all lines
db.close();
}
});
});
});
However, I do believe that passing a callback to insert ('safe mode') is slower than your current solution, where you call insert but don't wait for its result. To speed things up, instead of writing each line separately, you can buffer a number of lines and insert them in one statement.
Something similar to this (still without the line counting, which you would need for a fully correct close; note also that any lines left in the buffer must be flushed when the stream ends):
var buffer = [];

byLineStream.on("data", function(line) {
    buffer.push(parseLineToObject(line));
    if (buffer.length >= 100) {
        // insert a full batch and start a new buffer
        collection.insert(buffer, { w: 1 }, function() {
            if (readAllLines) {
                db.close();
            }
        });
        buffer = [];
    }
});

byLineStream.on("end", function() {
    readAllLines = true;
    // flush whatever is left over from the last partial batch
    if (buffer.length > 0) {
        collection.insert(buffer, { w: 1 }, function() {
            db.close();
        });
    }
});
For completeness, here is the full import script with the counting approach applied:
var MongoClient = require("mongodb").MongoClient;
var fs = require("fs");
var byline = require("byline");

var inStream = fs.createReadStream("data.txt", { encoding: "utf8" });
var byLineStream = byline.createStream(inStream);

MongoClient.connect("mongodb://localhost:27017/test", { native_parser: true }, function(err, db) {
    var collection = db.collection("Data");
    db.dropCollection("Data", function(err, result) { // I am completely replacing the collection
        var insertCount = 0;
        var doneReadingFile = false;

        byLineStream.on("end", function() {
            doneReadingFile = true;
        });

        byLineStream.on("data", function(line) {
            insertCount++; // one outstanding insert per line read
            var o = parseLineToObject(line);
            collection.insert(o, function(err, result) {
                insertCount--;
                if (insertCount === 0 && doneReadingFile) {
                    db.close();
                }
            });
        });
    });
});
Related
How can I stream a response using an in memory DB?
I'm using Loki JS as an in-memory DB. There is a particular resource where I must return the entire contents of a table (it cannot be paginated), and that table can grow to 500,000 items or so, which is about 300 MB.
In other cases, I have used fs.createReadStream to get a file and stream it back to the user:
fs.createReadStream('zips.json')
    .on('data', function() {
        res.write(...)
    })
    .on('end', function() {
        res.end();
    })
This has worked great for large files, but how can I do something equivalent using an in memory DB?
const items = lokiDb.addCollection('items');
items.insert('a bunch of items ...');
// I would now like to stream items via res.write
res.write(items)
Currently, res.write(items) will cause memory problems as Node is trying to return the entire response at once.
As far as I can tell, there is no native stream provider in Loki, though I may have missed it. What you may want to do instead is listen to the 'insert' event on the collection and write that, like so:
const items = lokiDb.addCollection('items');
items.on('insert', (results) => {
    res.write(results);
});
items.insert('a bunch of items ...');
If I'm correct, basically your problem is that readStreams only read from files, and that you want to read from an in-memory data structure. A solution might be to define your own readStream class that implements the stream.Readable _read method:
"use strict";

var util = require('util');
var stream = require('stream');

var begin = 0, end = 0;

var options = {
    highWaterMark: 16384,
    encoding: null,
    objectMode: false
};

util.inherits(InMemoryStream, stream.Readable);

function InMemoryStream(userDefinedOptions, resource) {
    if (userDefinedOptions) {
        for (var key in userDefinedOptions) {
            options[key] = userDefinedOptions[key]; // was options.key, which only ever set a property literally named "key"
        }
    }
    this.resource = resource;
    stream.Readable.call(this, options);
}

InMemoryStream.prototype._read = function(size) {
    if (begin >= this.resource.length) {
        this.push(null); // no data left: signal end of stream
        return;
    }
    end += size;
    this.push(this.resource.slice(begin, end));
    begin += size;
};

exports.InMemoryStream = InMemoryStream;

exports.readStream = function(userDefinedOptions, resource) {
    return new InMemoryStream(userDefinedOptions, resource);
};
You convert your in-memory data structure (in the following example, the Buffer returned by fs.readFile) to a readStream, and pipe it through to a writeStream, as follows:
"use strict";
var fs = require('fs');
var InMemoryStream = require('/home/regular/javascript/poc/inmemorystream.js');
var stored=[], writestream, config={};
config = {
encoding: null,
fileToRead: 'raphael.js',
fileToWrite: 'secondraphael.js'
}
fs.readFile(config.fileToRead, function(err, data){
if (err) return console.log('Error when opening file', err);
stored = data;
var inMemoryStream = InMemoryStream.readStream({encoding: config.encoding}, stored);
writestream = fs.createWriteStream(config.fileToWrite);
inMemoryStream.pipe(writestream);
inMemoryStream.on('error', function(err){
console.log('in memory stream error', err);
});
});
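Applied back to the original question, the same stream can be piped to the HTTP response instead of a file. A rough sketch, assuming an Express-style handler and the items collection from the question (note that JSON.stringify still builds the full string once; only the writes to the socket are chunked):
var InMemoryStream = require('./inmemorystream.js');

app.get('/items', function(req, res) {
    // items.find() with no query returns every document in the Loki collection
    var payload = JSON.stringify(items.find());
    var inMemoryStream = InMemoryStream.readStream({ encoding: 'utf8' }, payload);

    res.setHeader('Content-Type', 'application/json');
    inMemoryStream.pipe(res); // written out in highWaterMark-sized chunks
});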
This is my first personal project in Node.js, and I'm trying to go live soon.
I have a Node.js server that uses sqlite3. The table is already populated and has only 3000 rows, each with a word, its transform, and a precalculated value in separate columns.
I just need to look up the word in the DB to be sure it is valid.
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");

db.get("SELECT * FROM tab WHERE w = ?", word, function(err, row) {
    if (err) { console.log("Lookup:", word, ", Error => ", err); return false; }
    return true;
});
The problem is that the caller of this code has a lot of context and needs the operation to wait. So, I tried this:
function dbLookup(db, w) {
    return function(cb) {
        var rows = [];
        db.exec('SELECT w FROM tab WHERE w = "' + w + '"')
            .on('row', function(r) {
                rows.push(r);
            })
            .on('result', function() {
                cb(rows);
            });
    };
}

async.each([word], function(w) {
    dbLookup(this.db, w);
}, function(err) {
    if (err) { console.log("...ERROR..."); return false; }
    else { console.log("...SUCCESS..."); return true; }
});
This doesn't solve the wait issue as the callback can fire at its own pace.
I read that a promise library such as bluebird can solve my problem, but now I'm not able to get the value/result of the query out.
I've been pulling my hair out for so long. Please help me either get the async approach working or get the result back from the promise approach.
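For the record, here is roughly what the promise route could look like (a sketch using the built-in Promise; lookupWord is my own name, not something from the answers below):
function lookupWord(db, word) {
    return new Promise(function(resolve, reject) {
        // parameterised query, so the word is never concatenated into the SQL
        db.get("SELECT 1 AS found FROM tab WHERE w = ?", word, function(err, row) {
            if (err) return reject(err);
            resolve(row !== undefined); // a row comes back only if the word exists
        });
    });
}

lookupWord(db, word).then(function(isValid) {
    console.log(word, "valid?", isValid);
}).catch(function(err) {
    console.log("Lookup:", word, ", Error => ", err);
});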
var async = require('async');
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");

function check(word, callback) {
    db.get("SELECT count(1) cnt FROM tab WHERE w = ?", word, callback);
}

async.map(words, check, function(err, results) {
    if (err)
        return console.log('Query error');

    var all_checked = results.filter(function(r) {
        return r.cnt > 0;
    });
    ...
});
Or
var sqlite3 = require("sqlite3").verbose();
var db = new sqlite3.Database("validate.db");

db.all("SELECT distinct w FROM tab", function(err, rows) {
    // rows are row objects, so pull out the w column before comparing
    var known = rows.map(function(r) { return r.w; });
    var all_checked = words.filter(function(w) {
        return known.indexOf(w) != -1;
    });
    ...
});
I am working on scraping in Node.js: I use request to connect to the site, cheerio to access the data, and mongodb to store what is extracted. I am also using async.js to avoid infinite recursion.
I have a memory problem: my process keeps taking memory and does not free it. I think the problem is in mongodb, because if I don't use mongodb the memory remains stable.
This is my summarized code:
// Use function scrape_urls to process the urls
var q = self.asyn.queue(scrape_urls, 3);

// I push a bunch of urls ...
for (var j = 0; j < self.urls_data.length; j++) {
    q.push(self.urls_data[j]);
}

q.drain = function () {
    console.log("END");
};

function scrape_urls(data_url, next_action) {
    request({
        method: 'GET',
        url: data_url.url
    }, function (err, response, body) {
        var $ = cheerio.load(body);
        var data = { /* ... scraped data ... */ };

        mongo_client.connect(connection_string, function (err, db) {
            if (err) { return console.dir(err); }
            var collection = db.collection('foo');
            collection.insert(data);
            next_action();
        });
    });
}
As I say, if I avoid mongodb and only connect to the urls using request, the memory does not grow endlessly, so I think that connecting to mongodb is the problem.
Any ideas?
Problem solved.
I'll leave a solution here. I made a helper to reuse the connection and keep only one open (after all, Node.js is single-threaded):
var MongoDbHelper = function (mongo_client, connection_string) {
    var self = this;

    this.mongo_client = mongo_client;
    this.connection_string = connection_string;
    this.db = undefined;

    self.log = function (thread, str) {
        console.log(new Date().toISOString() + ' ' + process.memoryUsage().rss + ' [' + thread + '] ' + str);
    };

    self.getcollection = function (collection_name, callback) {
        var collection = null;
        try {
            collection = self.db.collection(collection_name);
        } catch (ex) {
            self.db = undefined;
        }

        // reconnect if the connection is lost
        if (self.db == undefined) {
            self.mongo_client.connect(connection_string, function (err, db) {
                self.db = db;
                var collection = self.db.collection(collection_name);
                callback(err, self.db, collection);
            });
        } else {
            callback(null, self.db, collection);
        }
    };
};

module.exports = MongoDbHelper;
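For context, this is roughly how the scraping function uses it, so every URL reuses the single connection instead of opening a new one (a sketch; the require path is illustrative):
var MongoDbHelper = require('./mongodbhelper');
var helper = new MongoDbHelper(mongo_client, connection_string);

function scrape_urls(data_url, next_action) {
    request({ method: 'GET', url: data_url.url }, function (err, response, body) {
        var $ = cheerio.load(body);
        var data = { /* ... scraped data ... */ };

        helper.getcollection('foo', function (err, db, collection) {
            if (err) { return console.dir(err); }
            collection.insert(data);
            next_action(); // one shared connection, so memory stays flat
        });
    });
}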
Is there something wrong here?
I have this snippet and it returns all the records in my stream; the statement filters seem to be ignored.
I already have the secondary index set up.
// bin type for uid is text
var statement = {
    concurrent: true,
    nobins: false
};
statement.filters = [aerospike.filter.equal("idx_mynamespace_myset_uid", "639085555553")];

var query = client.query('mynamespace', 'myset', statement);
var stream = query.execute();

var count = 0;
stream.on('data', function(rec) {
    // process the scanned record here
    count++;
    console.log(rec);
});
stream.on('error', function(err) {
    // console.log(err);
});
stream.on('end', function() {
    console.log('TOTAL SCANNED:', count++);
    process.exit(0);
});
A new version of Aerospike's Node.js client (1.0.31) has fixes for equal and range queries. However, please note that range queries on strings are not supported or recommended, and using them may result in unexpected behavior.
I use the following code to insert 1,000,000 documents into mongodb in a loop, but I found that the node process takes up a lot of memory and my client died.
db.collection("batch_insert", function (err, collection) {
if (!err) {
var count = 0;
for (var i = 0; i < 1000000; i++) {
collection.insert({hello:'world', ok:'OKOKOK'}, {safe:true, serializeFunctions:false}, function (err, result) {
count++;
if (1000000 == count) {
db.close();
}
});
}
} else {
console.log(err);
}
});
Your for loop blocks the event loop. It can't get to nextTick and handle query results until all the queries have been sent to mongodb. You need an asynchronous way to batch insert the data.
Something like this:
var mongo = require('mongodb');

var Inserter = function (collection) {
    this.collection = collection;
    this.data = [];
    this.maxThreads = 6;
    this.currentThreads = 0;
    this.batchSize = 5000;
    this.queue = 0;
    this.inserted = 0;
    this.startTime = Date.now();
};

Inserter.prototype.add = function (data) {
    this.data.push(data);
};

// Use force=true for last insert
Inserter.prototype.insert = function (force) {
    var that = this;
    if (this.data.length >= this.batchSize || force) {
        if (this.currentThreads >= this.maxThreads) {
            this.queue++;
            return;
        }
        this.currentThreads++;
        console.log('Threads: ' + this.currentThreads);
        this.collection.insert(this.data.splice(0, this.batchSize), { safe: true }, function () {
            that.inserted += that.batchSize;
            var currentTime = Date.now();
            var workTime = Math.round((currentTime - that.startTime) / 1000);
            console.log('Speed: ' + that.inserted / workTime + ' per sec');
            that.currentThreads--;
            if (that.queue > 0) {
                that.queue--;
                that.insert();
            }
        });
    }
};

var db = new mongo.Db('test', new mongo.Server('localhost', 27017, {}), { native_parser: false });
db.open(function (err, db) {
    db.collection('test', function (err, collection) {
        var inserter = new Inserter(collection);
        setInterval(function () {
            for (var i = 0; i < 5000; i++) {
                inserter.add({ test: 'test' });
            }
            inserter.insert();
        }, 0);
    });
});
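One usage note on the sketch above: when the data source is finite (unlike the setInterval demo), a final call with the force flag flushes whatever partial batch is left, per the comment in the code:
// Hypothetical wiring, not part of the original answer: once there is nothing
// left to add, force-flush the remaining partial batch.
inserter.insert(true);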
mongodb, just like any other database, takes some time to process requests. You're throwing a million requests at it, and since nothing in your code blocks, that means that at any time a whole bunch of them are going to be queued up somewhere (most likely in multiple places, with some of them inside the driver's code, others inside node's event loop). That takes more than a little bit of memory.
If the queuing didn't happen, you'd either block or drop some of the requests. There Ain't No Such Thing As A Free Lunch.
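If you do want to cap memory without a full batching helper, one common pattern is to bound the number of in-flight inserts and let each callback launch the next one. A minimal sketch (the limit of 100 and the variable names are mine; db and collection are the ones from the question):
var total = 1000000;
var limit = 100;     // at most 100 unacknowledged inserts at any time
var inFlight = 0;
var started = 0;

function startNext() {
    while (inFlight < limit && started < total) {
        inFlight++;
        started++;
        collection.insert({ hello: 'world', ok: 'OKOKOK' }, { safe: true }, function (err) {
            inFlight--;
            if (started === total && inFlight === 0) {
                db.close(); // every insert has been acknowledged
            } else {
                startNext(); // keep the pipeline full without queuing a million requests
            }
        });
    }
}

startNext();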