Memory issues on knex.js when using streams - node.js

I'm trying to export a whole sqlite3 database table to CSV using knex.js. As the table can hold up to 300000 rows, I use streams to avoid memory issues. But when I watch my app's memory usage, it climbs to over 800MB, or I get an "out of memory" error.
How can I handle a large query result with knex.js on an sqlite3 database?
Below is a sample of the code:
var fs = require('fs');
var stringify = require('csv-stringify'); // assumed: the csv-stringify module

knex.select().from(table).stream(function (stream) {
  var stringifier = stringify(opts);
  var fileStream = fs.createWriteStream(file);
  var i = 0;

  stringifier.on('readable', function() {
    var row;
    while ((row = stringifier.read())) {
      fileStream.write(row);
      console.log("row " + i++); // debug
    }
  });

  fileStream.once('open', function(fd) {
    stream.pipe(stringifier);
  });
});
EDIT
It seems knex.js streams for the sqlite3 dialect are "fake" streams.
Below is the source code of the stream function for sqlite3 in knex:
Runner_SQLite3.prototype._stream = Promise.method(function(sql, stream, options) {
  /*jshint unused: false*/
  var runner = this;
  return new Promise(function(resolver, rejecter) {
    stream.on('error', rejecter);
    stream.on('end', resolver);
    return runner.query(sql).map(function(row) {
      stream.write(row);
    }).catch(function(err) {
      stream.emit('error', err);
    }).then(function() {
      stream.end();
    });
  });
});
We can see that it waits for the whole query to execute before writing the result array to the stream, so nothing is actually streamed from the database.
VERSION:
Knex.js 0.7.5
Node 0.12
Thanks for your help.

I don't think there is a real solution. I use limit and offset to fetch the data step by step with knex.js and write each row to a write stream.
An implementation example for those who want one:
exportTable: function(table, writeStream) {
  var totalRows;
  var rowLimit = _config.ROW_LIMIT;

  return DatabaseManager.countAll(table).then(function(count) {
    totalRows = count[0]['count(*)'];
    var iterations = new Array(Math.ceil(totalRows / rowLimit));

    return Promise.reduce(iterations, function(total, item, index) {
      return _knex.select().from(table).limit(rowLimit).offset(index * rowLimit).map(function(row) {
        writeStream.write(row);
      }).catch(function(err) {
        return Promise.reject(err);
      });
    }, 0).then(function() {
      return Promise.resolve();
    }).catch(function(err) {
      return Promise.reject(err);
    });
  }).catch(function(err) {
    console.log(err);
    return Promise.reject(err);
  });
}
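For reference, if dropping below knex to the underlying sqlite3 driver is an option, its each() method delivers rows one at a time instead of buffering the whole result set. A minimal sketch, not part of the original answer; the database path, table name, and naive CSV formatting are assumptions:
var sqlite3 = require('sqlite3');
var fs = require('fs');

var db = new sqlite3.Database('./mydb.sqlite'); // path is an assumption
var out = fs.createWriteStream('./export.csv');

db.each('SELECT * FROM mytable', function(err, row) {
  if (err) { return console.error(err); }
  // naive CSV line; a real export should escape quotes/commas and honour write() backpressure
  out.write(Object.keys(row).map(function(k) { return row[k]; }).join(',') + '\n');
}, function(err, count) {
  if (err) { return console.error(err); }
  out.end();
  console.log('exported ' + count + ' rows');
});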

Related

How to handle a lot of result rows in Oracle Database with Node.js

I'm running a query against an Oracle database, but it returns so many rows that I get a JavaScript heap out of memory error.
Is there a way to handle the query result in batches, or something similar, so I don't get this out-of-memory error?
You didn't specify, but I'm guessing you're using node-oracledb. If so, then the trick is to use a ResultSet object. This will provide a read-consistent view of the data (a single point in time view of the data from the time the query started) while allowing you to stream data across the wire:
https://oracle.github.io/node-oracledb/doc/api.html#streamingresults
Here's an example that uses a QueryStream instance. The QueryStream class is just a wrapper on a ResultSet to provide streaming APIs.
var oracledb = require('oracledb');
var dbConfig = require('./dbconfig.js');

var rowcount = 0;

oracledb.getConnection(
  {
    user          : dbConfig.user,
    password      : dbConfig.password,
    connectString : dbConfig.connectString
  },
  function(err, connection) {
    if (err) {
      console.error(err.message);
      return;
    }

    var stream = connection.queryStream(
      'SELECT first_name, last_name FROM employees ORDER BY employee_id',
      [], // no binds
      { fetchArraySize: 150 } // internal buffer size for performance tuning
    );

    stream.on('error', function (error) {
      // console.log("stream 'error' event");
      console.error(error);
      return;
    });

    stream.on('metadata', function (metadata) {
      // console.log("stream 'metadata' event");
      console.log(metadata);
    });

    stream.on('data', function (data) {
      // console.log("stream 'data' event");
      console.log(data);
      rowcount++;
    });

    stream.on('end', function (metadata) {
      // console.log("stream 'end' event");
      stream.destroy(); // the stream should be closed when it has been finished
    });

    stream.on('close', function () {
      // console.log("stream 'close' event");
      console.log('Rows selected: ' + rowcount);

      connection.close( // Note: do not close connections on 'end'
        function(err) {
          if (err) {
            console.error(err.message);
          }
        });
    });
  });
You're probably going to be streaming the results to a file or an HTTP response object. In either case, you'll likely want proper JSON rather than the individual rows that the driver returns. Have a look at this issue for an example of how you can do that:
https://github.com/oracle/node-oracledb/issues/908#issuecomment-390006986
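For the file case, here is a minimal sketch of piping the query stream to disk as newline-delimited JSON (not from the original answer: the connection setup is assumed, and the query, column names, and output file name are placeholders):
const fs = require('fs');
const oracledb = require('oracledb');
const { Transform, pipeline } = require('stream');

// `connection` is assumed to be an open node-oracledb connection.
const rows = connection.queryStream(
  'SELECT first_name, last_name FROM employees',
  [],
  { outFormat: oracledb.OUT_FORMAT_OBJECT }
);

// Turn each row object into one JSON line.
const toNdjson = new Transform({
  writableObjectMode: true,
  transform(row, encoding, callback) {
    callback(null, JSON.stringify(row) + '\n');
  }
});

pipeline(rows, toNdjson, fs.createWriteStream('employees.ndjson'), (err) => {
  if (err) console.error(err);
  connection.close();
});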

How to 'pipe' oracle-db data from 'on data' event

I've been using node-oracledb for a few months and I've managed to achieve what I have needed to so far.
I'm currently working on a search app that could potentially return about 2m rows of data from a single call. To ensure I don't get a disconnect from the browser and the server, I thought I would try queryStream so that there is a constant flow of data back to the client.
I implemented the queryStream example as-is, and this worked fine for a few hundred thousand rows. However, when the number of returned rows exceeds one million, Node runs out of memory. By logging and watching both client and server log events, I can see that the client is way behind the server in terms of rows sent and received. So it looks like Node is falling over because it's buffering so much data.
It's worth noting that at this point, my selectstream implementation is within a req/res function called via Express.
To return the data, I do something like....
stream.on('data', function (data) {
  rowcount++;
  let obj = new myObjectConstructor(data);
  res.write(JSON.stringify(obj.getJson()));
});
I've been reading about how streams and pipe can help with flow, so what I'd like to be able to do is pipe the results from the query to a) help with flow and b) be able to pipe the results to other functions before sending them back to the client.
E.g.
function getData(req, res) {
  var stream = myQueryStream(connection, query);

  stream
    .pipe(toSomeOtherFunction)
    .pipe(yetAnotherFunction)
    .pipe(res);
}
I've spent a few hours trying to find a solution or example that allows me to pipe results, but I'm stuck and need some help.
Apologies if I'm missing something obvious, but I'm still getting to grips with Node and especially streams.
Thanks in advance.
There's a bit of an impedance mismatch here. The queryStream API emits rows of JavaScript objects, but what you want to stream to the client is a JSON array. You basically have to add an open bracket to the beginning, a comma after each row, and a close bracket to the end.
I'll show you how to do this in a controller that uses the driver directly as you have done, instead of using separate database modules as I advocate in this series.
const oracledb = require('oracledb');

async function get(req, res, next) {
  try {
    const conn = await oracledb.getConnection();
    const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});

    res.writeHead(200, {'Content-Type': 'application/json'});
    res.write('[');

    stream.on('data', (row) => {
      res.write(JSON.stringify(row));
      res.write(',');
    });

    stream.on('end', () => {
      res.end(']');
    });

    stream.on('close', async () => {
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });

    stream.on('error', async (err) => {
      next(err);

      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });
  } catch (err) {
    next(err);
  }
}

module.exports.get = get;
Once you get the concepts, you can simplify things a bit with a reusable Transform class which allows you to use pipe in the controller logic:
const oracledb = require('oracledb');
const { Transform } = require('stream');

class ToJSONArray extends Transform {
  constructor() {
    super({objectMode: true});
    this.push('[');
  }

  _transform(row, encoding, callback) {
    if (this._prevRow) {
      this.push(JSON.stringify(this._prevRow));
      this.push(',');
    }

    this._prevRow = row;

    callback(null);
  }

  _flush(done) {
    if (this._prevRow) {
      this.push(JSON.stringify(this._prevRow));
    }

    this.push(']');

    delete this._prevRow;

    done();
  }
}

async function get(req, res, next) {
  try {
    const toJSONArray = new ToJSONArray();
    const conn = await oracledb.getConnection();
    const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});

    res.writeHead(200, {'Content-Type': 'application/json'});

    stream.pipe(toJSONArray).pipe(res);

    stream.on('close', async () => {
      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });

    stream.on('error', async (err) => {
      next(err);

      try {
        await conn.close();
      } catch (err) {
        console.log(err);
      }
    });
  } catch (err) {
    next(err);
  }
}

module.exports.get = get;
Rather than writing your own logic to create a JSON stream, you can use JSONStream to convert an object stream to (stringified) JSON before piping it to its destination (res, process.stdout, etc.). This saves the need to muck around with .on('data', ...) events.
In the example below, I've used pipeline from node's stream module rather than the .pipe method: the effect is similar (with better error handling, I think). To get objects from oracledb.queryStream, you can specify the option {outFormat: oracledb.OUT_FORMAT_OBJECT} (docs). Then you can make arbitrary modifications to the stream of objects produced. This can be done using a transform stream, made perhaps using through2-map, or, if you need to drop or split rows, through2. Below, the stream is sent to process.stdout after being stringified as JSON, but you could equally send it to express's res.
require('dotenv').config() // config from .env file
const JSONStream = require('JSONStream')
const oracledb = require('oracledb')
const { pipeline } = require('stream')
const map = require('through2-map') // see https://www.npmjs.com/package/through2-map

oracledb.getConnection({
  user: process.env.DB_USER,
  password: process.env.DB_PASSWORD,
  connectString: process.env.CONNECT_STRING
}).then(connection => {
  pipeline(
    connection.queryStream(`
      select dual.*,'test' as col1 from dual
      union select dual.*, :someboundvalue as col1 from dual
      `
      ,{"someboundvalue":"test5"} // binds
      ,{
        prefetchRows: 150,   // for tuning
        fetchArraySize: 150, // for tuning
        outFormat: oracledb.OUT_FORMAT_OBJECT
      }
    )
    ,map.obj((row,index) => {
      row.arbitraryModification = index
      return row
    })
    ,JSONStream.stringify() // false gives ndjson
    ,process.stdout // or send to express's res
    ,(err) => { if(err) console.error(err) }
  )
})
// [
// {"DUMMY":"X","COL1":"test","arbitraryModification":0}
// ,
// {"DUMMY":"X","COL1":"test5","arbitraryModification":1}
// ]

Bulk insert into Postgres with brianc/node-postgres

I have the following code in Node.js that uses pg (https://github.com/brianc/node-postgres).
My code to create subscriptions for an employee is as follows.
client.query(
  'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
  'values ($1, $2, $3)', [
    datasetArr[0].subscription_guid,
    datasetArr[0].employer_guid,
    datasetArr[0].employee_guid
  ],
  function(err, result) {
    done();

    if (err) {
      set_response(500, err, res);
      logger.error('error running query', err);
      return console.error('error running query', err);
    }

    logger.info('subscription created');
    set_response(201);
  });
As you have already noticed datasetArr is an array. I would like to create mass subscriptions for more than one employee at a time. However I would not like to loop through the array. Is there a way to do it out of the box with pg?
I searched for the same question but haven't found a solution yet.
With the async library it is very simple to run the query several times and do the necessary error handling.
Maybe this code variant helps.
(Inserting 10,000 small JSON objects into an empty database took 6 seconds.)
Christoph
function insertData(item, callback) {
  client.query('INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
    'values ($1, $2, $3)', [
      item.subscription_guid,
      item.employer_guid,
      item.employee_guid
    ],
    function(err, result) {
      // return any err to async.each iterator
      callback(err);
    })
}

async.each(datasetArr, insertData, function(err) {
  // Release the client to the pg module
  done();

  if (err) {
    set_response(500, err, res);
    logger.error('error running query', err);
    return console.error('error running query', err);
  }

  logger.info('subscriptions created');
  set_response(201);
})
It looks to me like the best way is to use PostgreSQL's JSON functions:
client.query('INSERT INTO table (columns) ' +
  'SELECT m.* FROM json_populate_recordset(null::your_custom_type, $1) AS m',
  [JSON.stringify(your_json_object_array)], function(err, result) {
    if (err) {
      console.log(err);
    } else {
      console.log(result);
    }
  });
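As a side note, every Postgres table also exposes a matching composite row type, so you can often pass null::tablename instead of defining a separate your_custom_type. A sketch applied to the subscriptions table from the question (column names taken from the question; this is an illustration, not tested code):
// Bulk insert via json_populate_recordset, using the table's own row type
// (null::subscriptions) rather than a separately defined composite type.
client.query(
  'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
  'SELECT m.subscription_guid, m.employer_guid, m.employee_guid ' +
  'FROM json_populate_recordset(null::subscriptions, $1) AS m',
  [JSON.stringify(datasetArr)],
  function(err, result) {
    if (err) {
      return console.error(err);
    }
    console.log('inserted', result.rowCount, 'rows');
  });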
To do a bulk insert into PostgreSQL from Node.js, the better option is to use the COPY command provided by Postgres together with pg-copy-streams.
Code snippet from: https://gist.github.com/sairamkrish/477d20980611202f46a2d44648f7b14b
/*
  Pseudo code - to serve as a help guide.
*/
const copyFrom = require('pg-copy-streams').from;
const Readable = require('stream').Readable;
const { Pool, Client } = require('pg');
const fs = require('fs');
const path = require('path');
const datasourcesConfigFilePath = path.join(__dirname, '..', '..', 'server', 'datasources.json');
const datasources = JSON.parse(fs.readFileSync(datasourcesConfigFilePath, 'utf8'));

const pool = new Pool({
  user: datasources.PG.user,
  host: datasources.PG.host,
  database: datasources.PG.database,
  password: datasources.PG.password,
  port: datasources.PG.port,
});

export const bulkInsert = (employees) => {
  pool.connect().then(client => {
    let done = () => {
      client.release();
    }
    var stream = client.query(copyFrom('COPY employee (name,age,salary) FROM STDIN'));
    var rs = new Readable;
    let currentIndex = 0;
    rs._read = function () {
      if (currentIndex === employees.length) {
        rs.push(null);
      } else {
        let employee = employees[currentIndex];
        rs.push(employee.name + '\t' + employee.age + '\t' + employee.salary + '\n');
        currentIndex = currentIndex + 1;
      }
    };
    let onError = strErr => {
      console.error('Something went wrong:', strErr);
      done();
    };
    rs.on('error', onError);
    stream.on('error', onError);
    stream.on('end', done);
    rs.pipe(stream);
  });
}
Finer details are explained in the gist linked above.
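For completeness, a hypothetical call to the helper above, assuming an employee table with the three columns used in the COPY statement (the sample rows are made up):
// Made-up sample data matching COPY employee (name, age, salary)
bulkInsert([
  { name: 'Alice', age: 34, salary: 50000 },
  { name: 'Bob', age: 41, salary: 60000 }
]);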
Create your data structure as:
[ [val1,val2],[val1,val2] ...]
Then convert it into a string:
JSON.stringify([['a','b'],['c']]).replace(/\[/g,"(").replace(/\]/g,")").replace(/"/g,'\'').slice(1,-1)
Append it to the query and you are done!
Agreed, it has string-building costs, but it's way cheaper than single inserts.
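Note that splicing values straight into the SQL string bypasses parameterization. As an alternative, here is a sketch that builds a single multi-row, parameterized INSERT instead (column names taken from the question; the helper name is made up):
// Build "INSERT ... VALUES ($1,$2,$3),($4,$5,$6),..." with one flat parameter list.
function buildBulkInsert(rows) {
  var params = [];
  var placeholders = rows.map(function(row, i) {
    params.push(row.subscription_guid, row.employer_guid, row.employee_guid);
    var base = i * 3;
    return '($' + (base + 1) + ',$' + (base + 2) + ',$' + (base + 3) + ')';
  });
  var text = 'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) VALUES ' +
    placeholders.join(',');
  return { text: text, params: params };
}

var query = buildBulkInsert(datasetArr);
client.query(query.text, query.params, function(err, result) {
  if (err) {
    return console.error(err);
  }
  console.log('inserted', result.rowCount, 'rows');
});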
Use an ORM, e.g. Objection.
Also, increase the connection pool size based on your database server and the number of active connections you need.
someMovie
  .$relatedQuery('actors')
  .insert([
    {firstName: 'Jennifer', lastName: 'Lawrence'},
    {firstName: 'Bradley', lastName: 'Cooper'}
  ])
  .then(function (actors) {
    console.log(actors[0].firstName);
    console.log(actors[1].firstName);
  });

Iterating over a mongodb cursor serially (waiting for callbacks before moving to next document)

Using mongoskin, I can do a query like this, which will return a cursor:
myCollection.find({}, function(err, resultCursor) {
  resultCursor.each(function(err, result) {
    // process each result here
  });
});
However, I'd like to call some async functions for each document, and only move on to the next item on the cursor after this has called back (similar to the eachSeries structure in the async.js module). E.g:
myCollection.find({}, function(err, resultCursor) {
  resultCursor.each(function(err, result) {
    externalAsyncFunction(result, function(err) {
      //externalAsyncFunction completed - now want to move to next doc
    });
  });
});
How could I do this?
Thanks
UPDATE:
I don't want to use toArray() as this is a large batch operation, and the results might not fit in memory in one go.
A more modern approach that uses async/await:
const cursor = db.collection("foo").find({});

while (await cursor.hasNext()) {
  const doc = await cursor.next();
  // process doc here
}
Notes:
This may become even simpler to do when async iterators arrive.
You'll probably want to add try/catch for error checking.
The containing function should be async, or the code should be wrapped in (async function() { ... })() since it uses await; see the sketch after these notes.
If you want, add await new Promise(resolve => setTimeout(resolve, 1000)); (a one-second pause) at the end of the while loop to show that it does process docs one after the other.
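A minimal sketch of that wrapper, assuming db is an already-connected mongodb Db instance and processDoc is your own async per-document function:
// Sketch only: `db` and `processDoc` are assumptions, not part of the answer above.
(async function() {
  try {
    const cursor = db.collection("foo").find({});

    while (await cursor.hasNext()) {
      const doc = await cursor.next();
      await processDoc(doc); // the next document is only fetched after this resolves
    }
  } catch (err) {
    console.error(err);
  }
})();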
If you don't want to load all of the results into memory using toArray, you can iterate using the cursor with something like the following.
myCollection.find({}, function(err, resultCursor) {
  function processItem(err, item) {
    if (item === null) {
      return; // All done!
    }

    externalAsyncFunction(item, function(err) {
      resultCursor.nextObject(processItem);
    });
  }

  resultCursor.nextObject(processItem);
});
Since Node.js v10.3 you can use an async iterator:
const cursor = db.collection('foo').find({});

for await (const doc of cursor) {
  // do your thing
  // you can even use `await myAsyncOperation()` here
}
Jake Archibald wrote a great blog post about async iterators, which I came across after reading #user993683's answer.
This works with a large dataset by using setImmediate:
var cursor = collection.find({filter...}).cursor();

cursor.nextObject(function fn(err, item) {
  if (err || !item) return;

  setImmediate(fnAction, item, arg1, arg2, function() {
    cursor.nextObject(fn);
  });
});

function fnAction(item, arg1, arg2, callback) {
  // Here you can do whatever you want to do with your item.
  return callback();
}
If someone is looking for a Promise way of doing this (as opposed to using callbacks of nextObject), here it is. I am using Node v4.2.2 and mongo driver v2.1.7. This is kind of an asyncSeries version of Cursor.forEach():
function forEachSeries(cursor, iterator) {
  return new Promise(function(resolve, reject) {
    var count = 0;

    function processDoc(doc) {
      if (doc != null) {
        count++;
        return iterator(doc).then(function() {
          return cursor.next().then(processDoc);
        });
      } else {
        resolve(count);
      }
    }

    cursor.next().then(processDoc);
  });
}
To use this, pass the cursor and an iterator that operates on each document asynchronously (like you would for Cursor.forEach). The iterator needs to return a promise, like most mongodb native driver functions do.
Say, you want to update all documents in the collection test. This is how you would do it:
var theDb;

MongoClient.connect(dbUrl).then(function(db) {
  theDb = db; // save it, we'll need to close the connection when done.
  var cur = db.collection('test').find();

  return forEachSeries(cur, function(doc) { // this is the iterator
    return db.collection('test').updateOne(
      {_id: doc._id},
      {$set: {updated: true}} // or whatever else you need to change
    );
    // updateOne returns a promise, if not supplied a callback. Just return it.
  });
})
.then(function(count) {
  console.log("All Done. Processed", count, "records");
  theDb.close();
});
You can do something like this using the async lib. The key point here is to check if the current doc is null. If it is, it means you are finished.
async.series([
  function (cb) {
    cursor.each(function (err, doc) {
      if (err) {
        cb(err);
      } else if (doc === null) {
        cb();
      } else {
        console.log(doc);
        array.push(doc);
      }
    });
  }
], function (err) {
  callback(err, array);
});
You could use a Future:
myCollection.find({}, function(err, resultCursor) {
  resultCursor.count(Meteor.bindEnvironment(function(err, count) {
    for (var i = 0; i < count; i++) {
      var itemFuture = new Future();

      resultCursor.nextObject(function(err, item) {
        itemFuture.result(item);
      });

      var item = itemFuture.wait();
      //do what you want with the item,
      //and continue with the loop if so
    }
  }));
});
You can get the result in an Array and iterate using a recursive function, something like this.
myCollection.find({}).toArray(function (err, items) {
  var count = items.length;

  var fn = function () {
    externalAsyncFunction(items[count - 1], function () {
      count -= 1;
      if (count) fn();
    });
  };

  fn();
});
Edit:
This is only applicable for small datasets; for larger ones you should use cursors, as mentioned in the other answers.
A more modern approach that uses for await:
const cursor = db.collection("foo").find({});

for await (const doc of cursor) {
  // process doc here with await
  await processDoc(doc);
}
You could use simple setTimeout calls. This is an example in TypeScript running on Node.js (I am using promises via the 'when' module, but it can be done without them as well):
import mongodb = require("mongodb");

var dbServer = new mongodb.Server('localhost', 27017, {auto_reconnect: true}, {});
var db = new mongodb.Db('myDb', dbServer);
var util = require('util');
var when = require('when'); //npm install when

var dbDefer = when.defer();
db.open(function() {
  console.log('db opened...');
  dbDefer.resolve(db);
});

dbDefer.promise.then(function(db : mongodb.Db) {
  db.collection('myCollection', function (error, dataCol) {
    if (error) {
      console.error(error); return;
    }

    var doneReading = when.defer();

    var processOneRecordAsync = function(record) : When.Promise {
      var result = when.defer();
      setTimeout(function() {
        //simulate a variable-length operation
        console.log(util.inspect(record));
        result.resolve('record processed');
      }, Math.random() * 5);
      return result.promise;
    }

    var runCursor = function (cursor : MongoCursor) {
      cursor.next(function(error : any, record : any) {
        if (error) {
          console.log('an error occurred: ' + error);
          return;
        }
        if (record) {
          processOneRecordAsync(record).then(function(r) {
            setTimeout(function() {runCursor(cursor)}, 1);
          });
        }
        else {
          //cursor up
          doneReading.resolve('done reading data.');
        }
      });
    }

    dataCol.find({}, function(error, cursor : MongoCursor) {
      if (!error) {
        setTimeout(function() {runCursor(cursor)}, 1);
      }
    });

    doneReading.promise.then(function(message : string) {
      //message='done reading data'
      console.log(message);
    });
  });
});

read csv with headers then upload each row to couchdb using node/grunt

I would like to read a CSV file and upload each row to CouchDB using a grunt task. At this point I am not yet doing any database validation, such as checking whether the record already exists, but I will have to do that at some point too.
Currently this is what I am doing, and the problem is that only the first 65 rows of the first sub-task, named people, are being uploaded to CouchDB.
I know this has something to do with asynchronous execution, but I just can't work out how to fix it.
Gruntils.js
csv2couch: {
  people: {
    db: 'http://localhost:5984/db',
    collectionName: 'person',
    src: ['./data/schema3/people.csv']
  },
  organisms: {
    db: '<%= qmconfig.COUCHDBURL %>',
    collectionName: 'organism',
    src: ['./data/schema3/organisms.csv']
  }
}
csv2couch.js
'use strict';

var nanolib = require('nano'),
  csv = require('csv'),
  urls = require('url'),
  fs = require('fs');

module.exports = function(grunt) {

  grunt.registerMultiTask('csv2couch', 'Parse csv file and upload data to couchdb.', function() {

    var done, parts, dbname, _this, collectionName;

    _this = this;
    done = this.async();

    parts = urls.parse(this.data.db);
    dbname = parts.pathname.replace(/^\//, '');
    collectionName = this.data.collectionName;

    // Merge task-specific and/or target-specific options with these defaults.
    var options = this.options({});

    // couchdb connection
    try {
      var nano = nanolib(parts.protocol + '//' + parts.host);
    } catch (e) {
      grunt.warn(e);
      done(e, null);
    }

    // database connection
    var db = nano.use(dbname);

    // process each source csv file
    this.filesSrc.forEach(function(f) {
      console.log('source file:', f);

      csv()
        .from.path(f, {
          columns: true,
          delimiter: ',',
          quote: '"'
        })
        .on('record', function(row, index) {
          console.log('#' + index, row);
          save(row, collectionName);
        })
        .on('end', function(count) {
          console.log('Number of lines: ' + count);
          done();
        })
        .on('error', function(error) {
          console.log(error.message);
          done(error);
        });
    });

    function save(data, collectionName) {
      // document ID is concatenation of collectionName and ID
      var docID = collectionName[0] + '_' + data.ID;

      // add some additional data
      data.type = collectionName;

      // insert data into couchdb
      db.insert(data, docID, function(err, body, header) {
        if (err) {
          console.log('[db.insert] ', err.message);
          return;
        }
      });
    }

  });

};
You're right, the async code is incorrect. The CSV file is being read to the end before all your records are saved. You need to call done only when your last record has been saved.
Your save method needs to take a callback:
var rowsRead = 0,    // the number of rows read from the csv file
    rowsWritten = 0; // the number of rows written to CouchDb

In the caller:

.on('record', function(row, index) {
  rowsRead++;
  save(row, collectionName, function(err) {
    if (err) {
      return done(err);
    }
    rowsWritten++;
    if (rowsRead === rowsWritten) { // check if we've written all records to CouchDb
      done();
    }
  });
})
save method:
function save(data, collectionName, callback) {
  // document ID is concatenation of collectionName and ID
  var docID = collectionName[0] + '_' + data.ID;

  // add some additional data
  data.type = collectionName;

  // insert data into couchdb
  db.insert(data, docID, callback);
}
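If the failures turn out to come from too many concurrent inserts rather than from the early done() call, a throttled variant is also possible. A sketch using async.eachLimit once the CSV has been fully parsed (the async dependency and the concurrency of 10 are assumptions, not part of this answer):
var async = require('async'); // assumed dependency

var rows = [];

csv()
  .from.path(f, { columns: true, delimiter: ',', quote: '"' })
  .on('record', function(row) {
    rows.push(row); // collect rows first
  })
  .on('end', function() {
    // insert with at most 10 requests in flight at a time
    async.eachLimit(rows, 10, function(row, cb) {
      save(row, collectionName, cb); // save() with the callback added above
    }, function(err) {
      if (err) { return done(err); }
      done();
    });
  })
  .on('error', function(error) {
    done(error);
  });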
