Querying Large Dataset in Oracle Database from NodeJS

Querying Large Dataset in Oracle Database from NodeJS - node.js

I'm currently working on a project from work where i have an Oracle 10 database table with about 310K give or take 10-30K rows.
The goal is to display those rows in an angular frontend, however returning all of those through NodeJS is taking a lot of time.
Given that I'm using both NodeJS and oracledb for the first time, i'm assuming i must be missing something?
var oracledb = require('oracledb');
var config = require(__dirname+'/../db.js');
function get(req,res,next)
{
var table = req.query.table;
var meta;
oracledb.getConnection(config.oracle)
.then( function(connection)
{
var stream = connection.queryStream('SELECT * FROM '+table);
stream.on('error', function (error)
{
console.error(error);
return next(err);
});
stream.on('metadata', function (metadata) {
console.log(metadata);
});
stream.on('data', function (data) {
console.log(data);
});
stream.on('end', function ()
{
connection.release(
function(err) {
if (err) {
console.error(err.message);
return next(err);
}
});
});
})
.catch(function(err){
if(err){
connection.close(function(err){
if(err){
console.error(err.message);
return next(err);
}
});
}
})
}
module.exports.get = get;

30 MB is a lot of data to load into the front end. It can work in some cases, such as desktop web apps where the benefits of "caching" the data offset the time needed to load it (and increased stale data is okay). But it will not work well in other cases, such as mobile.
Keep in mind that the 30 MB must be moved from the DB to Node.js and then from Node.js to the client. The network connections between these will greatly impact performance.
I'll point out a few things that can help performance, though not all are exactly related to this question.
First, if you're using a web server, you should be using a connection pool, not dedicated/one-off connections. Generally, you'd create the connection pool in your index/main/app.js and start the web server after that's done and ready.
Here's an example:
const oracledb = require('oracledb');
const express = require('express');
const config = require('./db-config.js');
const thingController = require('./things-controller.js');
// Node.js used 4 background threads by default, increase to handle max DB pool.
// This must be done before any other calls that will use the libuv threadpool.
process.env.UV_THREADPOOL_SIZE = config.poolMax + 4;
// This setting can be used to reduce the number of round trips between Node.js
// and the database.
oracledb.prefetchRows = 10000;
function initDBConnectionPool() {
console.log('Initializing database connection pool');
return oracledb.createPool(config);
}
function initWebServer() {
console.log('Initializing webserver');
app = express();
let router = new express.Router();
router.route('/things')
.get(thingController.get);
app.use('/api', router);
app.listen(3000, () => {
console.log('Webserver listening on localhost:3000');
});
}
initDBConnectionPool()
.then(() => {
initWebServer();
})
.catch(err => {
console.log(err);
});
That will create a pool which is added to the internal pool cache in the driver. This allows you to easily access it from other modules (example later).
Note that when using connection pools, it's generally a good idea to increase the thread pool available to Node.js to allow each connection in the pool to work concurrently. An example of this is included above.
In addition, I'm increasing the value of oracledb.prefetchRows. This setting is directly related to your question. Network round trips are used to move the data between the DB and Node.js. This setting allows you to adjust the number of rows fetched with each round trip. So as prefetchRows goes higher, fewer round trips are needed and performance increases. Just be careful you don't go to high as per the memory you have in your Node.js server.
I ran a generic test that mocked the 30 MB dataset size. When oracledb.prefetchRows was left at the default of 100, the test finished in 1 minute 6 seconds. When I bumped this up to 10,000, it finished in 27 seconds.
Okay, moving on to "things-controller.js" which is based on your code. I've updated the code to do the following:
Assert that table is a valid table name. Your current code is vulnerable to SQL injection.
Use a promise chain that emulates a try/catch/finally block to close the connection just once and return the first error encountered (if needed).
Work so I could run the test.
Here's the result:
const oracledb = require('oracledb');
function get(req, res, next) {
const table = req.query.table;
const rows = [];
let conn;
let err; // Will store the first error encountered
// You need something like this to preven SQL injection. The current code
// is wide open.
if (!isSimpleSqlName(table)) {
next(new Error('Not simple SQL name'));
return;
}
// If you don't pass a config, the connection is pulled from the 'default'
// pool in the cache.
oracledb.getConnection()
.then(c => {
return new Promise((resolve, reject) => {
conn = c;
const stream = conn.queryStream('SELECT * FROM ' + table);
stream.on('error', err => {
reject(err);
});
stream.on('data', data => {
rows.push(data);
});
stream.on('end', function () {
resolve();
});
});
})
.catch(e => {
err = err || e;
})
.then(() => {
if (conn) { // conn assignment worked, need to close/release conn
return conn.close();
}
})
.catch(e => {
console.log(e); // Just log, error during release doesn't affect other work
})
.then(() => {
if (err) {
next(err);
return;
}
res.status(200).json(rows);
});
}
module.exports.get = get;
function isSimpleSqlName(name) {
if (name.length > 30) {
return false;
}
// Fairly generic, but effective. Would need to be adjusted to accommodate quoted identifiers,
// schemas, etc.
if (!/^[a-zA-Z0-9#_$]+$/.test(name)) {
return false;
}
return true;
}
I hope that helps. Let me know if you have questions.

Related

MongoError: pool destroyed when fetching all data without conditions

I am new to mongoDb, as I am trying to query from different collection and in order to do that, when I am fetching data from category collection I mean when I am running select * from collection it is throwing error, MongoError: pool destroyed.
As per my understanding it is because of some find({}) is creating a pool and that is being destroyed.
The code which I am using inside model is below,
const MongoClient = require('mongodb').MongoClient;
const dbConfig = require('../configurations/database.config.js');
export const getAllCategoriesApi = (req, res, next) => {
return new Promise((resolve, reject ) => {
let finalCategory = []
const client = new MongoClient(dbConfig.url, { useNewUrlParser: true });
client.connect(err => {
const collection = client.db(dbConfig.db).collection("categories");
debugger
if (err) throw err;
let query = { CAT_PARENT: { $eq: '0' } };
collection.find(query).toArray(function(err, data) {
if(err) return next(err);
finalCategory.push(data);
resolve(finalCategory);
// db.close();
});
client.close();
});
});
}
When my finding here is when I am using
let query = { CAT_PARENT: { $eq: '0' } };
collection.find(query).toArray(function(err, data) {})
When I am using find(query) it is returning data but with {} or $gte/gt it is throwing Pool error.
The code which I have written in controller is below,
import { getAllCategoriesListApi } from '../models/fetchAllCategory';
const redis = require("redis");
const client = redis.createClient(process.env.REDIS_PORT);
export const getAllCategoriesListData = (req, res, next, query) => {
// Try fetching the result from Redis first in case we have it cached
return client.get(`allstorescategory:${query}`, (err, result) => {
// If that key exist in Redis store
if (false) {
res.send(result)
} else {
// Key does not exist in Redis store
getAllCategoriesListApi(req, res, next).then( function ( data ) {
const responseJSON = data;
// Save the Wikipedia API response in Redis store
client.setex(`allstorescategory:${query}`, 3600, JSON.stringify({ source: 'Redis Cache', responseJSON }));
res.send(responseJSON)
}).catch(function (err) {
console.log(err)
})
}
});
}
Can any one tell me what mistake I am doing here. How I can fix pool issue.
Thanking you in advance.

I assume that toArray is asynchronous (i.e. it invokes the callback passed in as results become available, i.e. read from the network).
If this is true the client.close(); call is going to get executed prior to results having been read, hence likely yielding your error.
The close call needs to be done after you have finished iterating the results.
Separately from this, you should probably not be creating the client instance in the request handler like this. Client instances are expensive to create (they must talk to all of the servers in the deployment before they can actually perform queries) and generally should be created per running process rather than per request.

many queries postgres (node), no parallel queries?

I am running a node server with the postgres-node (pg) package.
I wrote a program, which requests n-queries (for instance 20,000) at once to my postgres database.
When I do this with several clients who want to query 20,000 at once too, there is no parallelity. That means, the requests of the second client will be queued until the first client finished all his queries.
Is this a normal behavior for postgres? If yes, how can I prevent that one user gets all the ressources (and the others have to wait) if there is no parallelity?
This is my code:
const express = require('express');
const app = express();
const { Pool } = require("pg");
const pool = new Pool();
benchmark(){
pool.connect((err, client, done) => {
if (err) throw err;
client.query("SELECT * from member where m_id = $1", [1], (err, res) => {
done();
if (err) {
console.log(err.stack);
} else {
console.log(res.rows[0]);
}
});
});
}
app.get('/', function(req, res) {
for(let i=0;i<20000;i++){
benchmark();
}
});

First you need to create a connection pool, here's an example with node's pg in a separate module (node-pg-sql.js) for convenience:
node-pg-sql.js:
const { Pool } = require('pg');
const pool = new Pool(fileNameConfigPGSQL);
module.exports = {
query: (text, params, callback) => {
const start = Date.now()
return pool.query(text, params, (err, res) => {
const duration = Date.now() - start
// console.log('executed query', { text, duration, rows: res.rowCount })
callback(err, res)
})
},
getClient: (callback) => {
pool.connect((err, client, done) => {
const query = client.query.bind(client)
// monkey patch
client.query = () => {
client.lastQuery = arguments
client.query.apply(client, arguments)
}
// Timeout 5 sek
const timeout = setTimeout(() => {
// console.error('A client has been checked out for more than 5 seconds!')
// console.error(`The last executed query on this client was: ${client.lastQuery}`)
}, 5000)
const release = (err) => {
// 'done' Methode - returns client to the pool
done(err)
// clear Timeouts
clearTimeout(timeout)
// reset der Query-Method before Monkey Patch
client.query = query
}
callback(err, client, done)
})
}
}
In your postgresql.conf (on linux normally under /var/lib/pgsql/data/postgresql.conf) set max-connection to the desired value:
max_connection = 300
Keep in mind:
Each PostgreSQL connection consumes RAM for managing the connection or the client using it. The more connections you have, the more RAM you will be using that could instead be used to run the database.
While increasing your max-connections, you need to increase shared_buffers and kernel.shmmax as well in order for the client-connection increase to be effective .
Whenever you want to run a query from in one of your routes/endpoints just require the separate client-pool-file like:
const db = require('../../../node-pg-sql');
module.exports = (router) => {
router.get('/someRoute', (req, res) => {
console.log(`*****************************************`);
console.log(`Testing pg..`);
let sqlSelect = `SELECT EXISTS (
SELECT 1
FROM pg_tables
WHERE schemaname = 'someschema'
)`;
db.query(sqlSelect, (errSelect, responseSelect) => {
if (errSelect) {
/* INFO: Error while querying table */
console.log(`*****************************************`);
console.log(`ERROR WHILE CHECKING CONNECTION: ${errSelect}`);
}
else {
// INFO: No error from database
console.log(`*****************************************`);
console.log(`CONNECTION TO PGSQL WAS SUCCESSFUL..`);
res.json({ success: true, message: responseSelect, data:responseSelect.rows[0].exists });
}
})
});
}
EDIT:
"there is no parallelity.."
Node is asynchronous, you can either work with promises or spawn more clients/pools and tune your max-connections (as explained in my answer, but keep performance of your host-machine in mind), but with multiple clients running around 20.000 queries, they won't resolve with a result instantly or parallel. What is the exact goal you try to achieve?
"Is this a normal behavior for postgres?"
This is due to node's event-loop as well as due to certain performance-limitation of the host-machine running the Postgres.

How to 'pipe' oracle-db data from 'on data' event

I've been using node-oracledb for a few months and I've managed to achieve what I have needed to so far.
I'm currently working on a search app that could potentially return about 2m rows of data from a single call. To ensure I don't get a disconnect from the browser and the server, I thought I would try queryStream so that there is a constant flow of data back to the client.
I implemented the queryStream example as-is, and this worked fine for a few hundred thousand rows. However, when the returned rows is greater than one million, Node runs out of memory. By logging and watching both client and server log events, I can see that client is way behind the server in terms of rows sent and received. So, it looks like Node is falling over because it's buffering so much data.
It's worth noting that at this point, my selectstream implementation is within a req/res function called via Express.
To return the data, I do something like....
stream.on('data', function (data) {
rowcount++;
let obj = new myObjectConstructor(data);
res.write(JSON.stringify(obj.getJson());
});
I've been reading about how streams and pipe can help with flow, so what I'd like to be able to do is to be able to pipe the results from the query to a) help with flow and b) to be able to pipe the results to other functions before sending back to the client.
E.g.
function getData(req, res){
var stream = myQueryStream(connection, query);
stream
.pipe(toSomeOtherFunction)
.pipe(yetAnotherFunction)
.pipe(res);
}
I'm spent a few hours trying to find a solution or example that allows me to pipe results, but I'm stuck and need some help.
Apologies if I'm missing something obvious, but I'm still getting to grips with Node and especially streams.
Thanks in advance.

There's a bit of an impedance mismatch here. The queryStream API emits rows of JavaScript objects, but what you want to stream to the client is a JSON array. You basically have to add an open bracket to the beginning, a comma after each row, and a close bracket to the end.
I'll show you how to do this in a controller that uses the driver directly as you have done, instead of using separate database modules as I advocate in this series.
const oracledb = require('oracledb');
async function get(req, res, next) {
try {
const conn = await oracledb.getConnection();
const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});
res.writeHead(200, {'Content-Type': 'application/json'});
res.write('[');
stream.on('data', (row) => {
res.write(JSON.stringify(row));
res.write(',');
});
stream.on('end', () => {
res.end(']');
});
stream.on('close', async () => {
try {
await conn.close();
} catch (err) {
console.log(err);
}
});
stream.on('error', async (err) => {
next(err);
try {
await conn.close();
} catch (err) {
console.log(err);
}
});
} catch (err) {
next(err);
}
}
module.exports.get = get;
Once you get the concepts, you can simplify things a bit with a reusable Transform class which allows you to use pipe in the controller logic:
const oracledb = require('oracledb');
const { Transform } = require('stream');
class ToJSONArray extends Transform {
constructor() {
super({objectMode: true});
this.push('[');
}
_transform (row, encoding, callback) {
if (this._prevRow) {
this.push(JSON.stringify(this._prevRow));
this.push(',');
}
this._prevRow = row;
callback(null);
}
_flush (done) {
if (this._prevRow) {
this.push(JSON.stringify(this._prevRow));
}
this.push(']');
delete this._prevRow;
done();
}
}
async function get(req, res, next) {
try {
const toJSONArray = new ToJSONArray();
const conn = await oracledb.getConnection();
const stream = await conn.queryStream('select * from employees', [], {outFormat: oracledb.OBJECT});
res.writeHead(200, {'Content-Type': 'application/json'});
stream.pipe(toJSONArray).pipe(res);
stream.on('close', async () => {
try {
await conn.close();
} catch (err) {
console.log(err);
}
});
stream.on('error', async (err) => {
next(err);
try {
await conn.close();
} catch (err) {
console.log(err);
}
});
} catch (err) {
next(err);
}
}
module.exports.get = get;

Rather than writing your own logic to create a JSON stream, you can use JSONStream to convert an object stream to (stringified) JSON, before piping it to its destination (res, process.stdout etc) This saves the need to muck around with .on('data',...) events.
In the example below, I've used pipeline from node's stream module rather than the .pipe method: the effect is similar (with better error handling I think). To get objects from oracledb.queryStream, you can specify option {outFormat: oracledb.OUT_FORMAT_OBJECT} (docs). Then you can make arbitrary modifications to the stream of objects produced. This can be done using a transform stream, made perhaps using through2-map, or if you need to drop or split rows, through2. Below the stream is sent to process.stdout after being stringified as JSON, but you could equally send to it express's res.
require('dotenv').config() // config from .env file
const JSONStream = require('JSONStream')
const oracledb = require('oracledb')
const { pipeline } = require('stream')
const map = require('through2-map') // see https://www.npmjs.com/package/through2-map
oracledb.getConnection({
user: process.env.DB_USER,
password: process.env.DB_PASSWORD,
connectString: process.env.CONNECT_STRING
}).then(connection => {
pipeline(
connection.queryStream(`
select dual.*,'test' as col1 from dual
union select dual.*, :someboundvalue as col1 from dual
`
,{"someboundvalue":"test5"} // binds
,{
prefetchRows: 150, // for tuning
fetchArraySize: 150, // for tuning
outFormat: oracledb.OUT_FORMAT_OBJECT
}
)
,map.obj((row,index) => {
row.arbitraryModification = index
return row
})
,JSONStream.stringify() // false gives ndjson
,process.stdout // or send to express's res
,(err) => { if(err) console.error(err) }
)
})
// [
// {"DUMMY":"X","COL1":"test","arbitraryModification":0}
// ,
// {"DUMMY":"X","COL1":"test5","arbitraryModification":1}
// ]

nodejs pg transactions without nesting

I would like to know if it's possible to run a series of SQL statements and have them all committed in a single transaction.
The scenario I am looking at is where an array has a series of values that I wish to insert into a table, not individually but as a unit.
I was looking at the following item which provides a framework for transactions in node using pg. The individual transactions appear to be nested within one another so I am unsure of how this would work with an array containing a variable number of elements.
https://github.com/brianc/node-postgres/wiki/Transactions
var pg = require('pg');
var rollback = function(client, done) {
client.query('ROLLBACK', function(err) {
//if there was a problem rolling back the query
//something is seriously messed up. Return the error
//to the done function to close & remove this client from
//the pool. If you leave a client in the pool with an unaborted
//transaction weird, hard to diagnose problems might happen.
return done(err);
});
};
pg.connect(function(err, client, done) {
if(err) throw err;
client.query('BEGIN', function(err) {
if(err) return rollback(client, done);
//as long as we do not call the `done` callback we can do
//whatever we want...the client is ours until we call `done`
//on the flip side, if you do call `done` before either COMMIT or ROLLBACK
//what you are doing is returning a client back to the pool while it
//is in the middle of a transaction.
//Returning a client while its in the middle of a transaction
//will lead to weird & hard to diagnose errors.
process.nextTick(function() {
var text = 'INSERT INTO account(money) VALUES($1) WHERE id = $2';
client.query(text, [100, 1], function(err) {
if(err) return rollback(client, done);
client.query(text, [-100, 2], function(err) {
if(err) return rollback(client, done);
client.query('COMMIT', done);
});
});
});
});
});
My array logic is:
banking.forEach(function(batch){
client.query(text, [batch.amount, batch.id], function(err, result);
}

pg-promise offers a very flexible support for transactions. See Transactions.
It also supports partial nested transactions, aka savepoints.
The library implements transactions automatically, which is what should be used these days, because too many things can go wrong, if you try organizing a transaction manually as you do in your example.
See a related question: Optional INSERT statement in a transaction

Here's a simple TypeScript solution to avoid pg-promise
import { PoolClient } from "pg"
import { pool } from "../database"
const tx = async (callback: (client: PoolClient) => void) => {
const client = await pool.connect();
try {
await client.query('BEGIN')
try {
await callback(client)
await client.query('COMMIT')
} catch (e) {
await client.query('ROLLBACK')
}
} finally {
client.release()
}
}
export { tx }
Usage:
...
let result;
await tx(async client => {
const { rows } = await client.query<{ cnt: string }>('SELECT COUNT(*) AS cnt FROM users WHERE username = $1', [username]);
result = parseInt(rows[0].cnt) > 0;
});
return result;

Initialization of db connection - nodejs

I want to use gridfs-stream in a nodejs application.
A simple example is given in the documentation:
var mongoose = require('mongoose');
var Grid = require('gridfs-stream');
Grid.mongo = mongoose.mongo;
mongoose.connect('mongodb://localhost:27017/test');
// make sure the db instance is open before passing into `Grid`
mongoose.connection.once('open', function () {
var gfs = Grid(mongoose.connection);
// all set!
})
My problem is described by the comment:
make sure the db instance is open before passing into Grid
I try to use gfs in a post request. Now when the code gets initialized, the gfs variable is not defined yet.
api.post('/upload', function(req, res) {
req.pipe(gfs.createWriteStream({
filename: 'test'
}).on('close', function(savedFile){
console.log('file saved', savedFile);
return res.json({file: savedFile});
}));
})
Initializing my route from a callback seems kind of odd.
I read in this post (Asynchronous initialization of Node.js module) that require('') is performed synchronous, and since I rely on the connection being established, I'm kind of forced to wait
Basically I'm not sure if I should use a async pattern on startup now, or if I just miss a more elegant way to solve this.

I have a very similar problem with my server. In my case I am reading https certs asynchronously, the software version from git asynchronously and I want to make sure I have it all together by the time the user comes to log in so I can pass the software version back as a reply to login.
The solution is to use promises. Create the promises on user start up for each activity. Then in the code where you want to be sure its all ready, just call then on either the promise itself or Promise.all(array of promises).then()
Here is an example of what I am doing to read the ssl certs to start the server
class Web {
constructor(manager,logger) {
var self = this;
this.server = false;
this.logger = logger;
var key = new Promise((resolve,reject) => {
fs.readFile(path.resolve(__dirname, 'key.pem'),(err,data) => {
if (err) {
reject(err);
} else {
resolve(data);
}
});
});
var cert = new Promise((resolve,reject) => {
fs.readFile(path.resolve(__dirname, 'certificate.pem'), (err,data) => {
if (err) {
reject(err);
} else {
resolve(data);
}
});
});
Promise.all([key,cert]).then(values => {
var certs = {
key: values[0],
cert: values[1],
};
return certs;
}).then(certs => {
self.server = require('http2').createServer(certs,(req,res) => {
// NOW Started and can do the rest of the stuff
});
self.server.listen(...);
});
NEEDS SOME MORE CLOSING BRACKETS

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Querying Large Dataset in Oracle Database from NodeJS - node.js

Related

MongoError: pool destroyed when fetching all data without conditions

many queries postgres (node), no parallel queries?

How to 'pipe' oracle-db data from 'on data' event

nodejs pg transactions without nesting

Initialization of db connection - nodejs

Categories

Resources