Bulk insert into Postgres with brianc/node-postgres - node.js

I have the following code in Node.js that uses pg (https://github.com/brianc/node-postgres).
My code to create a subscription for an employee looks like this:
client.query(
    'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
    'VALUES ($1, $2, $3)',
    [
        datasetArr[0].subscription_guid,
        datasetArr[0].employer_guid,
        datasetArr[0].employee_guid
    ],
    function(err, result) {
        done();
        if (err) {
            set_response(500, err, res);
            logger.error('error running query', err);
            return console.error('error running query', err);
        }
        logger.info('subscription created');
        set_response(201);
    });
As you may have noticed, datasetArr is an array. I would like to create subscriptions for more than one employee at a time, but without looping through the array myself. Is there a way to do this out of the box with pg?

I searched for the same question but have not found a solution yet.
With the async library it is very simple to run the query several times and do the necessary error handling.
Maybe this code variant helps.
(Inserting 10,000 small JSON objects into an empty database took about 6 seconds.)
Christoph
function insertData(item, callback) {
    client.query(
        'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
        'VALUES ($1, $2, $3)',
        [
            item.subscription_guid,
            item.employer_guid,
            item.employee_guid
        ],
        function(err, result) {
            // return any err to the async.each iterator
            callback(err);
        });
}

async.each(datasetArr, insertData, function(err) {
    // Release the client back to the pg pool
    done();
    if (err) {
        set_response(500, err, res);
        logger.error('error running query', err);
        return console.error('error running query', err);
    }
    logger.info('subscriptions created');
    set_response(201);
});

It seems to me that the best approach is to use PostgreSQL's JSON functions:
client.query('INSERT INTO table (columns) ' +
        'SELECT m.* FROM json_populate_recordset(null::your_custom_type, $1) AS m',
        [JSON.stringify(your_json_object_array)], function(err, result) {
    if (err) {
        console.log(err);
    } else {
        console.log(result);
    }
});
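For the subscriptions table from the original question, a minimal sketch of this approach could look like the following. It assumes the JSON keys match the column names and uses the table's own row type instead of a separate custom type:
// Minimal sketch, assuming the subscriptions table from the original question.
// json_populate_recordset can use the table's own row type, so no separate
// custom type is needed as long as the JSON keys match the column names.
var subscriptions = [
    { subscription_guid: 'a1', employer_guid: 'b1', employee_guid: 'c1' },
    { subscription_guid: 'a2', employer_guid: 'b2', employee_guid: 'c2' }
];

client.query(
    'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) ' +
    'SELECT m.subscription_guid, m.employer_guid, m.employee_guid ' +
    'FROM json_populate_recordset(null::subscriptions, $1) AS m',
    [JSON.stringify(subscriptions)],
    function (err, result) {
        if (err) return console.error(err);
        console.log('inserted ' + result.rowCount + ' rows');
    });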

To do a bulk insert into PostgreSQL from Node.js, the better option is the COPY command provided by Postgres, together with pg-copy-streams.
Code snippet from: https://gist.github.com/sairamkrish/477d20980611202f46a2d44648f7b14b
/*
  Pseudo code - to serve as a help guide.
*/
const copyFrom = require('pg-copy-streams').from;
const Readable = require('stream').Readable;
const { Pool, Client } = require('pg');
const fs = require('fs');
const path = require('path');

const datasourcesConfigFilePath = path.join(__dirname, '..', '..', 'server', 'datasources.json');
const datasources = JSON.parse(fs.readFileSync(datasourcesConfigFilePath, 'utf8'));

const pool = new Pool({
    user: datasources.PG.user,
    host: datasources.PG.host,
    database: datasources.PG.database,
    password: datasources.PG.password,
    port: datasources.PG.port,
});

export const bulkInsert = (employees) => {
    pool.connect().then(client => {
        let done = () => {
            client.release();
        };
        var stream = client.query(copyFrom('COPY employee (name,age,salary) FROM STDIN'));
        var rs = new Readable();
        let currentIndex = 0;
        rs._read = function () {
            if (currentIndex === employees.length) {
                rs.push(null);
            } else {
                let employee = employees[currentIndex];
                rs.push(employee.name + '\t' + employee.age + '\t' + employee.salary + '\n');
                currentIndex = currentIndex + 1;
            }
        };
        let onError = strErr => {
            console.error('Something went wrong:', strErr);
            done();
        };
        rs.on('error', onError);
        stream.on('error', onError);
        stream.on('end', done);
        rs.pipe(stream);
    });
};
Finer details explained in this link
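For completeness, a hypothetical call site might look like this (the employee shape is only an assumption based on the COPY column list above):
// Hypothetical usage, matching the (name, age, salary) columns in the COPY statement.
bulkInsert([
    { name: 'Alice', age: 30, salary: 50000 },
    { name: 'Bob', age: 42, salary: 60000 }
]);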

Create your data structure as:
[ [val1, val2], [val1, val2], ... ]
Then convert it into a string:
JSON.stringify([['a','b'],['c']]).replace(/\[/g,"(").replace(/\]/g,")").replace(/"/g,'\'').slice(1,-1)
Append it to the query and you are done!
Agreed, it has string-parsing costs, but it's way cheaper than single inserts.
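A rough sketch of the whole approach, assuming the subscriptions table from the question (note that the values end up interpolated directly into the SQL string, so this only suits trusted or pre-escaped data):
// Rough sketch, assuming the subscriptions table from the question.
var rows = datasetArr.map(function (d) {
    return [d.subscription_guid, d.employer_guid, d.employee_guid];
});

var valuesString = JSON.stringify(rows)
    .replace(/\[/g, '(')
    .replace(/\]/g, ')')
    .replace(/"/g, '\'')
    .slice(1, -1); // e.g. ('a','b','c'),('d','e','f')

client.query(
    'INSERT INTO subscriptions (subscription_guid, employer_guid, employee_guid) VALUES ' +
    valuesString,
    function (err, result) {
        if (err) return console.error(err);
        console.log('inserted ' + result.rowCount + ' rows');
    });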

Use an ORM; e.g. Objection.
Also, increase the connection pool size based on your DB server and the number of active connections you need.
someMovie
    .$relatedQuery('actors')
    .insert([
        {firstName: 'Jennifer', lastName: 'Lawrence'},
        {firstName: 'Bradley', lastName: 'Cooper'}
    ])
    .then(function (actors) {
        console.log(actors[0].firstName);
        console.log(actors[1].firstName);
    });
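Since Objection sits on top of Knex, the pool size is configured there. A minimal sketch, assuming a Postgres connection (the connection string and pool numbers are placeholders to tune against your server):
// Minimal sketch: wiring Objection to Knex with a larger connection pool.
const { Model } = require('objection');
const Knex = require('knex');

const knex = Knex({
    client: 'pg',
    connection: 'postgres://user:password@localhost:5432/mydb',
    pool: { min: 2, max: 20 } // raise max based on what your DB server allows
});

Model.knex(knex);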

Related

Node JS MSSQL Callbacks

No matter how many different posts, books, and tutorials I read, I can't seem to grasp promises. I'm trying to execute a chunk of code in Node after the SQL requests have completed, but I can't make it work. Any help at all would be greatly appreciated. I need console.log('done') to run after the for loop has finished running through the req array:
app.post('/QueryWrite', function (req, res) {
    var anyErrors = false;
    sql.connect(config).then(function (err) {
        if (err) {
            console.log(err);
            res.send(err);
            anyErrors = true;
        }
        var i;
        for (i = 0; i < req.body['query[]'].length; i++) {
            var sqlQuery = req.body["query[]"][i];
            let sqlRequest = new sql.Request();
            sqlRequest.query(sqlQuery, function (err) {
                if (err) {
                    console.log(err);
                    res.send(err);
                    anyErrors = true;
                }
            })
            console.log(req.body.authenticatedAs + " wrote to DB: " + sqlQuery);
        }
    }).then(console.log('done'));
});
Here is a somewhat simplified example of a database connection.
// database.js file
const mysql = require("mysql");
const utils = require("util");

const dbCon = mysql.createConnection({
    debug: false,
    host: "localhost",
    user: "root",
    database: "laravel_test_tweety",
    password: "",
    multipleStatements: true,
});

dbCon.query = utils.promisify(dbCon.query);
module.exports = dbCon;
This is roughly how I use my database connections. With that in place, let's move on to server.js.
//server.js
// ... here your code with creation of app and such,
// and we are importing dbCon from database.js
const dbCon = require("./database");
// server that listens for requests called `app`
app.post("/QueryWrite", function (req, res) {
/**
* in case if you are sending an object as queries
* we need them as array
* { query1: 'some query', query2: 'some query' }
* =>
* ['some query', 'some query']
*
* So that's why we are using `Object.values`
*/
let queries = Object.values(req.body["query[]"]);
if (queries) {
// So there are couple of options here
// Let's say you are only selecting stuff from db
let query = queries.join(";");
// This is gonna give you something like
// "SELECT 1; SELECT 2"
// If you want this to work you need `multipleStatements: true` on
// mysqlConfig from database.js
dbCon
.query(query)
.then((results) => {
console.log(results);
// if needed do some stuff here and send res back.
res.json(results);
})
.catch(err => {
console.error(err);
// do some other stuff
});
}
});
Beyond that, if you need to do inserts or updates you also need a way to send the data along with each query. I can't quite get my head around that right now, but perhaps you could separate those queries in the array and run them one by one; see the sketch below.
I'm aware this is not exactly what you want, but at least for SELECT or DELETE queries this approach works.
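One hedged sketch of that idea, still using the promisified dbCon from database.js: have the client send each statement together with its values (the payload shape below is just an assumption) and run them sequentially:
// Hedged sketch: expects an array like
// [{ sql: 'INSERT INTO tweets (body) VALUES (?)', values: ['hello'] }, ...]
// and runs each statement one by one through the promisified dbCon.
async function runQueriesSequentially(queries) {
    const results = [];
    for (const q of queries) {
        // mysql's query(sql, values) handles placeholder escaping for us
        results.push(await dbCon.query(q.sql, q.values || []));
    }
    return results;
}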

Do math with the result of two separate queries node.js

I program almost exclusively in Python but am trying to learn Node. My mind is so stuck in synchronicity that I'm making up words and banging my head against the wall trying to figure out callbacks. I understand that a callback is a function passed to another function? I've successfully written very simple callbacks but can't get this code to work the way I'd like it to.
Essentially, I need to multiply the results of these two queries, and then I'll write an if statement based on that math.
I'm hoping someone can show me how to write a function that calls these functions, waits for the results, multiplies them together, and contains an if statement so I can do something with the result.
This needs to be done with Node, as I'm adding it to a chat bot developed with Node.
var getSkuCount = function() {
    pool.getConnection(function(err, connection) {
        connection.query("select count(sku) from products_per_store where store_id = " + sID + " group by store_id", function (err, record) {
            if (err) {
                console.error('DATABASE ERROR:', err);
            }
            return record
            connection.release();
        });
    });
};

var getAssetCount = function () {
    console.log("getting total of scrapers attached to " + store_id);
    pool.getConnection(function(err, connection) {
        connection.query("SELECT count(*) FROM external_crawl_settings WHERE store_id = " + sID + " group by store_id", function (err, record) {
            if (err) {
                console.log(err);
                return console.error('DATABASE ERROR:', err);
            }
            connection.release();
        });
    });
}

var skuCount = getSkuCount();
var assetCount = getAssetCount();

if skuCount * assetCount > 50000 {
    do something
};
I've eliminated the global variables assetCount and skuCount and taken a different approach to address all your questions. This solution uses two different files: one for managing connections and one for consolidating all your routes.
You need the following in your index.js or a similar server start-up script for your app.
app-server.js // server start-up file

'use strict';
let http = require('http');
let express = require('express');
let connectionManager = require('./connection-manager');

// read from your config file
let config = {
    port: 7007,
    host: 'host',
    user: 'user',
    password: 'password',
    database: 'database',
    connectionLimit: 'limit'
};

let server;

function startServer(config) {
    let application = require('../'); // your application with all the routes
    server = http.createServer(application);
    return new Promise((resolve, reject) => {
        server.listen(config.port, () => {
            return resolve();
        }).on('error', (err) => {
            return reject(err);
        });
    });
}

connectionManager.init(config).then(() => {
    return startServer(config);
}).then(() => {
    console.log(`server is up at ${config.port}`);
}).catch((err) => {
    console.log('err while starting server', err.stack);
});
connection-manager.js // connection manager

'use strict';
let mysql = require('promise-mysql');
let connectionPool;

class Connections {
    static init(config) {
        return mysql.createPool({
            host: config.host,
            user: config.user,
            password: config.password,
            database: config.database,
            connectionLimit: config.limit
        }).getConnection().then((connection) => {
            connectionPool = connection;
        });
    }

    static getConnection() {
        // you can call this across your application
        return connectionPool;
    }

    static releaseConnection() {
        // call this only if you want to shut down the application
        connectionPool.close(); // or equivalent method available
    }
}

module.exports = Connections;
sample.js

'use strict';
let connection = require('./connection-manager').getConnection();

function compute(sID) {
    let skuCount = connection.query('select count(sku) "cnt" from products_per_store where store_id = ' + sID + ' group by store_id');
    let assetCount = connection.query('SELECT count(*) "cnt" FROM external_crawl_settings WHERE store_id = ' + sID + ' group by store_id');
    return Promise.all([
        skuCount,
        assetCount
    ]).then((results) => {
        let skuCount = results[0];
        let assetCount = results[1];
        if (skuCount * assetCount > 50000) {
            // do something
        }
    }).catch((err) => {
        console.log('DATABASE ERROR:', err.stack);
    });
}
Also, is there a limit on how many open connections you can have?
Since the connection pool handles connection recycling for you, it depends on the hardware resources you have. I would recommend starting with the defaults and increasing until you get the performance you want.
My slack-bot randomly crashes and I can't figure out the reason why.
Do you use a process manager like pm2? If so, only by looking at its output can I help you debug further. Process managers keep track of all the exceptions and errors you would normally get, since they manage the application.
Does a program end only when there's an uncaught error?
Yes, if you haven't handled process.on('uncaughtException') and process.on('unhandledRejection'). It is good practice in Node.js land to let the program crash and restart.
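A minimal sketch of registering those handlers (the event names below are the canonical ones), so the crash at least gets logged before pm2 restarts the process:
// Log the two global failure events before the process dies.
process.on('uncaughtException', function (err) {
    console.error('uncaught exception:', err.stack);
    process.exit(1); // state may be corrupted, so let pm2 restart the app
});

process.on('unhandledRejection', function (reason) {
    console.error('unhandled rejection:', reason);
});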
Could my bot be hitting a connection limit and crashing?
Can't say. But you can get additional clues by inspecting /var/log/mysql/error.log, the error stack traces in your logs, and the pm2 logs.
How do you release the connection?
You don't have to, if you are using any connection pool.
pool.getConnection().then(function(connection) {
    let skuCount = connection.query('select count(sku) "cnt" from products_per_store where store_id = ' + sID + ' group by store_id');
    let assetCount = connection.query('SELECT count(*) "cnt" FROM external_crawl_settings WHERE store_id = ' + sID + ' group by store_id');
    return Promise.all([
        skuCount,
        assetCount
    ]).then((results) => {
        let skuCount = parseInt(results[0][0].cnt);
        let assetCount = parseInt(results[1][0].cnt);
        if (skuCount * assetCount > 50000) {
            console.log('Too many inputs to run without permission');
        }
        console.log(skuCount * assetCount);
    }).catch((err) => {
        console.log('DATABASE ERROR:', err.stack);
    });
}).catch(function(err) {
    console.log(err);
});

Memory issues on knex.js when using streams

I'm trying to export a whole SQLite3 database table to CSV using knex.js. As the table can have up to 300,000 rows, I use streams to avoid memory issues. But if I look at the memory usage of my app, it goes up to 800 MB, or I get an "out of memory" error.
How can I handle a large query result with knex.js on an SQLite3 database?
Below is a sample of the code:
knex.select().from(table).stream(function (stream) {
    var stringifier = stringify(opts);
    var fileStream = fs.createWriteStream(file);
    var i = 0;

    stringifier.on('readable', function() {
        var row;
        while (row = stringifier.read()) {
            fileStream.write(row);
            console.log("row " + i++); //debug
        }
    });

    fileStream.once('open', function(fd) {
        stream.pipe(stringifier);
    });
});
EDIT
It seems knex.js streams for SQLite3 databases are "fake" streams.
Below is the source code of the stream function for SQLite3 in knex:
Runner_SQLite3.prototype._stream = Promise.method(function(sql, stream, options) {
    /*jshint unused: false*/
    var runner = this;
    return new Promise(function(resolver, rejecter) {
        stream.on('error', rejecter);
        stream.on('end', resolver);
        return runner.query(sql).map(function(row) {
            stream.write(row);
        }).catch(function(err) {
            stream.emit('error', err);
        }).then(function() {
            stream.end();
        });
    });
});
We can see that it waits for the query to finish before creating the stream from the result array.
VERSION:
Knex.js 0.7.5
Node 0.12
Thanks for your help.
I don't think there is a solution. I use limit and offset to fetch the data step by step with knex.js, and I write each row to a write stream.
An implementation example for those who want one:
exportTable: function(table, writeStream) {
    var totalRows;
    var rowLimit = _config.ROW_LIMIT;

    return DatabaseManager.countAll(table).then(function(count) {
        totalRows = count[0]['count(*)'];
        var iterations = new Array(Math.ceil(totalRows / rowLimit));

        return Promise.reduce(iterations, function(total, item, index) {
            return _knex.select().from(table).limit(rowLimit).offset(index * rowLimit).map(function(row) {
                writeStream.write(row);
            }).catch(function(err) {
                return Promise.reject(err);
            });
        }, 0).then(function() {
            return Promise.resolve();
        }).catch(function(err) {
            return Promise.reject(err);
        });
    }).catch(function(err) {
        console.log(err);
        return Promise.reject(err);
    });
}
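A hypothetical call site for the function above; here `out` stands for whatever writable accepts the row objects, for example the csv stringifier piped into a file stream from the original question:
// Hypothetical usage; `out` is a writable that accepts the row objects,
// e.g. the csv stringifier piped to a file stream in the original question.
exportTable('my_table', out).then(function () {
    out.end();
    console.log('export finished');
}).catch(function (err) {
    console.error('export failed', err);
});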

node-postgres create database

I am using node-postgres, and at the beginning of my application I want to check whether the database exists or not. My workflow idea is as follows:
Check whether myDb exists
If it is there, create the tables
If not, create the database first, then the tables
As you can see it is a really simple process; however, the driver implementation requires a database name, postgres://username:password@host/database, to connect, which means you need to connect to a database first.
So what I am doing now is connecting to the postgres database at the beginning, making a query to create the database, catching the exception if it is already there, then closing my connection and connecting to the newly created database, then creating the tables. Here is the code:
var conStringPri = 'postgres://' + username + ':' + password + '@' + host +
    '/postgres';
var conStringPost = 'postgres://' + username + ':' + password + '@' + host +
    '/' + dbName;

pg.connect(conStringPri, function(err, client, done) { // connect to the postgres db
    if (err)
        console.log('Error while connecting: ' + err);
    client.query('CREATE DATABASE ' + dbName, function(err) { // create the user's db
        if (err)
            console.log('ignoring the error'); // ignore if the db is already there
        client.end(); // close the connection

        // create a new connection to the new db
        pg.connect(conStringPost, function(err, clientOrg, done) {
            // create the table
            clientOrg.query('CREATE TABLE IF NOT EXISTS ' + tableName + ' ' +
                '(...some sql...)');
        });
    });
});
As you can see, I am opening and closing the connection twice, and this approach seems wrong to me. I'd be glad if you could propose a better way, or explain how you accomplished this.
As you can see it is a really simple process; however, the driver implementation requires a database name, postgres://username:password@host/database, to connect, which means you need to connect to a database first.
It's not because of the driver implementation, it's PostgreSQL itself. It's the same with any other language or driver.
A client needs to be connected to a database in order to do anything, including a CREATE DATABASE. Besides the postgres database, template1 is often used for this purpose too.
Then, since you must connect to the freshly created database to create objects inside it, there's no way to avoid opening another connection.
In short, what you're doing can't be simplified, it's already optimal.
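If you prefer checking for the database instead of catching the CREATE DATABASE error, a hedged sketch of that variant, using the same maintenance connection as in your code, would be:
// Hedged sketch: query pg_catalog.pg_database from the maintenance connection
// before deciding whether to issue CREATE DATABASE.
pg.connect(conStringPri, function (err, client, done) {
    if (err) return console.error('Error while connecting: ' + err);

    client.query('SELECT 1 FROM pg_database WHERE datname = $1', [dbName],
        function (err, result) {
            if (err) return console.error(err);

            if (result.rows.length === 0) {
                // database is missing; create it (identifiers can't be parameterised)
                client.query('CREATE DATABASE ' + dbName, function (err) {
                    done();
                    // ...then connect to dbName and create the tables, as in the question
                });
            } else {
                done();
                // ...database already exists, go straight to creating the tables
            }
        });
});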
I've just written a module for that: https://github.com/olalonde/pgtools
var pgtools = require('pgtools');

pgtools.createdb({
    user: 'postgres',
    password: 'some pass',
    port: 5432,
    host: 'localhost'
}, 'test-db', function (err, res) {
    if (err) {
        console.error(err);
        process.exit(-1);
    }
    console.log(res);
});
Hopefully it can make your code a bit cleaner.
This is a bit old, but I just want to share how I handled this kind of setup.
You need to call the third parameter of the callback, which is the done from pg.connect(conn, (err, client, done) => {}). This releases the connection and brings it back to the pool.
async.series([
    done => {
        pg.connect(connPrimary, (err, client, releaseConn) => {
            if (err) return done(err)

            client.query(`CREATE DATABASE ${conf.database}`, (err) => {
                if (err && !~err.message.indexOf('already exists')) {
                    return done(err)
                }

                client.end()
                releaseConn()
                done()
            })
        })
    },
    done => {
        let connSecondary = `postgres://${conf.user}:${conf.password}@${conf.host}:${conf.port}/${conf.database}`

        pg.connect(connSecondary, (err, client, releaseConn) => {
            if (err) return done(err)

            let createTableQuery = `CREATE TABLE IF NOT EXISTS test_table(_id bigint primary key, co2_field varchar(40) NOT NULL, temp_field int NOT NULL, quality_field decimal NOT NULL, reading_time_field timestamp NULL)`

            client.query(createTableQuery, err => {
                if (err) return done(err)

                releaseConn()
                done()
            })
        })
    }
], err => {
    should.ifError(err)
    doneInit()
})
Here is a script I use, which essentially just executes shell commands with execa:
import execa from 'execa';

class DatabaseService {
  public async setupDatabase() {
    const logCmd = (cmd: execa.ExecaChildProcess) => {
      cmd.stdout.on('data', (data) => {
        this.logger.log(data.toString());
      });
      cmd.stderr.on('data', (data) => {
        this.logger.error(data.toString());
      });
    };

    const createUser = () => {
      return new Promise<void>((resolve, reject) => {
        const cmd = execa('createuser', [Config.databaseUser, '--superuser']);
        logCmd(cmd);

        let userExists = false;
        cmd.stderr.on('data', (data) => {
          if (
            data
              .toString()
              .includes(`role "${Config.databaseUser}" already exists`)
          ) {
            userExists = true;
          }
        });

        cmd.on('exit', (code) => {
          if (!userExists && code) {
            reject(new Error(`Failed to create user for database: ${code}`));
          } else {
            resolve();
          }
        });
      });
    };

    const createDatabase = () => {
      return new Promise<void>((resolve, reject) => {
        const cmd = execa('createdb', [Config.databaseName]);
        logCmd(cmd);

        let databaseExists = false;
        cmd.stderr.on('data', (data) => {
          if (
            data
              .toString()
              .includes(`database "${Config.databaseName}" already exists`)
          ) {
            databaseExists = true;
          }
        });

        cmd.on('exit', (code) => {
          if (!databaseExists && code) {
            reject(new Error(`Failed to create database: ${code}`));
          } else {
            resolve();
          }
        });
      });
    };

    await createUser();
    await createDatabase();
  }
}
As you can see, the script detects if the user or database already exists and ignores errors in those cases, because the intended state of Postgres has been met, and that's all I care about when I run it.
Install
npm install --save -g pgtools
CLI Example
createdbjs my_awesome_db --user=admin --password=admin

Mongoose with async queue & waterfall

I aim to import a large amount of data with Mongoose. As a newbie, I fail to set up the flow control properly with the various mechanisms offered by async. I'd be glad if someone could point me to an appropriate solution. Thanks.
var async = require('async'),
    mongoose = require('mongoose');

mongoose.connect('mongodb://localhost/test');
var Cat = mongoose.model('Cat', { name: String });

// Imagine this is a huge array with a million items.
var content = ['aaa', 'bbb', 'ccc'];

var queries = [];
content.forEach(function(name) {
    queries.push(function(cb) {
        var obj = new Cat({ name: name });
        obj.save(function(err) {
            console.log("SAVED: " + name);
            console.log(err);
        });
        return true;
    });
});

// FAILED: async.parallel adds all content to db,
// but it would exhaust the resource with too many parallel tasks.
async.parallel(queries, function(err, result) {
    if (err)
        return console.log(err);
    console.log(result);
});

// FAILED: save the first item but not the rest
async.waterfall(queries, function(err, result) {
    if (err)
        return console.log(err);
    console.log(result);
});

// FAILED: same as async.waterfall, async.queue saves the first item only
var q = async.queue(function(name, cb) {
    var obj = new Cat({ name: name });
    obj.save(function(err) {
        console.log("SAVED: " + name);
        console.log(err);
    });
});

q.push(content, function (err) {
    console.log('finished processing queue');
});
I think eachLimit or eachSeries fit your situation best:
var content = ['aaa', 'bbb', 'ccc'];

async.eachLimit(content, 10, function(name, done) {
    var obj = new Cat({ name : name });
    obj.save(done);
    // if you want to print some status info, use this instead:
    //
    // obj.save(function(err) {
    //     console.log("SAVED: " + name);
    //     console.log(err);
    //     done(err);
    // });
    //
}, function(err) {
    // handle any errors;
});
With eachLimit, you can run X queries "in parallel" (10 in the example above) to speed things up without exhausting resources. eachSeries waits for the previous save to finish before continuing with the next, effectively saving one object at a time.
Notice that with the each* functions you won't get a list of (saved) objects back (it's a bit of a fire-and-forget mechanism where you're not interested in the outcome, bar any errors). If you do want a list of the saved objects in the end, you can use the equivalent map* functions: mapLimit and mapSeries.
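A small sketch of the mapLimit variant, which resolves with the saved documents (Mongoose passes the saved document as the second callback argument):
// Collect the saved Cat documents while limiting concurrency to 10.
async.mapLimit(content, 10, function (name, done) {
    var obj = new Cat({ name: name });
    obj.save(done); // mongoose calls back with (err, savedDoc)
}, function (err, savedCats) {
    if (err) return console.log(err);
    console.log('saved ' + savedCats.length + ' cats');
});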
