Error when do Promise.all (close connection / hang up) - node.js

I have working with google-spreadsheet#2.0.7 package, I have many data to be exported to Google Sheet,
for now I use this code
const insertDataToSheet = async (data, sheet, msg) => {
let query = []
try {
data.map(async item => {
query.push(promisify(sheet.addRow)(item))
})
const result = await Promise.all(query)
if (result) return result
throw new Error(`${msg} Unkown Error`)
} catch (e) {
throw new Error(`${msg} Failed: ${e.message}`)
}
}
This code is working with 100 data or less, but if I use 150+ data the connection not support it.
Error List
- Client network socket disconnected before secure TLS connection was established
- Socket hang up
- Error: HTTP error 429 (Too Many Requests)
Is there any limitation for Promise.all.?
or
Is there any better solution to export batch / bulk data to Google
Spreadsheet?

Promise.all will throw if one of the promise throw. If you want to proceed even if one promise fails, you do not want to rethrow it as in your code above.
you can re-add it to pending queue and try it again.
also, i may consider batching it. divide them into chunks and upload it.
example:
create a pool of worker (number of work = number of cpu cores (default))
run uploading logic with the worker pool
simulate error / retry with Math.random
process.js file
const path = require('path')
const _ = require('lodash')
const Pool = require('piscina')
const BB = require('bluebird')
const workerPool = new Pool({
filename: path.resolve(__dirname, 'worker.js'),
})
const generateData = (numItems = 5) => {
return Array.from({ length: numItems }, (v, idx) => 'item ' + idx)
}
const CHUNK_SIZE = 10
const data = generateData(100)
const chunks = _.chunk(data, CHUNK_SIZE)
BB.map(
chunks,
(chunk) => {
workerPool.runTask(chunk)
},
{ concurrency: 1 /* 1 chunk at a time */ }
)
worker.js file
const retry = require('p-retry')
// your upload logic here
function process(data) {
if (Math.random() > 0.5) {
console.log('processing ', data)
} else {
console.log('fail => retry ', data)
throw new Error('process failed' + data)
}
}
module.exports = (data) => {
return retry(() => process(data), { retries: 10 })
}
run with node process.js

In final i work on this, and find out there is new version of the package google-spreadsheet#3.0.11.
it's change from Google Drive API to Google Sheets API.
It has many changes, but in my case now I can Batch / Bulk insert just with single line.
this is my code now.
const insertDataToSheet = async (data, sheet, msg) => {
try {
const result = await sheet.addRows(data)
if (result) return result
throw new Error(`${msg} Unkown Error`)
} catch (e) {
throw new Error(`${msg} Failed: ${e.message}`)
}
}
I just use sheet.addRows and tada it's working.
My Problem is solved, but with promise I still need to learn,
Thanks for all of your suggestion / attention.

Related

Got incomplete data on stream piping to an express response

Need to convert a DB table to a csv report.
If I immediately unload the entire tablet with one query then the application crashes because the memory runs out. I decided to query data from the table in portions of 100 rows, convert each row into a line of the report and write it into a stream that is piped with an express response.
All this happens nearly like this:
DB query
const select100Users = (maxUserCreationDateStr) => {
return db.query(`
SELECT * FROM users WHERE created_at < to_timestamp(${maxUserCreationDateStr})
ORDER BY created_at DESC LIMIT 100`);
}
stream initialisation
const { PassThrough } = require('stream');
const getUserReportStream = () => {
const stream = new PassThrough();
writeUserReport(stream).catch((e) => stream.emit('error', e));
return stream;
};
piping the stream with an express response
app.get('/report', (req, res) => {
const stream = getUserReportStream();
res.setHeader('Content-Type', 'application/vnd.ms-excel');
res.setHeader(`Content-Disposition', 'attachment; filename="${ filename }"`);
stream.pipe(res);
});
and finally how do I write data to the stream
const writeUserReport(stream) => {
let maxUserCreationDateGlobal = Math.trunc(Date.now() / 1000);
let flag = true;
stream.write(USER_REPORT_HEADER);
while (flag) {
const rows100 = await select100Users(maxUserCreationDateGlobal);
console.log(rows100.length);
if (rows100.length === 0) {
flag = false;
} else {
let maxUserCreationDate = maxUserCreationDateGlobal;
const users100 = await Promise.all(
rows100.map((r) => {
const created_at = r.created_at;
const createdAt = new Date(created_at);
if (created_at && createdAt.toString() !== 'Invalid Date') {
const createdAtNumber = Math.trunc(createdAt.valueOf() / 1000);
maxUserCreationDate = Math.min(maxUserCreationDate, createdAtNumber);
}
return mapUser(r); // returns a promise
})
);
users100.forEach((u) => stream.write(generateCsvRowFromUser(u)));
maxUserCreationDateGlobal = maxUserCreationDate;
if (rows100.length < 100) {
flag = false;
console.log('***');
}
}
}
console.log('end');
stream.end();
};
as a result I see this output in the console:
100 // 100
100 // 200
100 // 300
100 // 400
100 // 500
87 // 587
***
end
But in the downloaded file I get 401 lines (the first one with USER_REPORT_HEADER). It feels like stream.end() closes the stream before all values are read from it.
I tried using BehaviorSubject from rxjs instead of PassThrough in a similar way - the result is the same..
How can I wait for reading from the stream of all the data that I wrote there?
Or maybe someone can recommend an alternative way to solve this problem.
stream.write expects you to pass a callback as a second (or third parameter), to know when the write operation did finish. You can't call write again unless the previous write operation is finished.
So in general I'd suggest to make this whole function async and every time you call stream.write you wrap it into a Promise like
await new Promise((resolve, reject) => stream.write(data, (error) => {
if (error) {
reject(error);
return;
}
resolve();
});
Obviously it would make sense to extract this to some method.
EDIT: Additionally I don't think that's the actual problem. I assume your http connection is just timing out before all the fetching is completed, so the server will eventually close the stream once the timeout deadline is met.

How to await a streaming sql query in node.js

I need to call out to a function that runs a sql query, with row level functionality, and await the entire process before continuing.
Function code:
const sql = require('mssql')
exports.doit = ()=>{
const pool1 = new sql.ConnectionPool(dbConfig);
const pool1Connect = pool1.connect();
pool1.on('error', err => {
console.error('error occurred on pool')
})
await pool1Connect
try {
const request = pool1.request();
request.stream = true;
request.query('select * from dbo.user');
request.on('row', async orow => {
console.log('outer row');
const innerPool = new sql.ConnectionPool(dbConfig);
const innerConnection = innerPool.connect();
innerPool.on('error', err => {
console.error('error occurred on pool')
});
const iConnection = await innerConnection;
connections.push(iConnection);
const innerRequest = innerPool.request();
innerRequest.stream = true;
var iquery = 'select * from dbo.order where userId='+ orow.userId
innerRequest.query(iquery);
innerRequest.on('row', async irow => {
console.log(`User: ${orow.userId} Order: ${irow.orderId}`);
});
innerRequest.on('done', async () => {
console.log('inner done');
iConnection.close();
});
});
request.on('done', async () => {
console.log('outer done');
})
} catch (err) {
console.error('SQL error', err);
}
sql.on('error', err => {
// ... error handler
})
}
Then call the above function like this:
var doit = require('./testmeHandler.js').doit;
doit()
.then(()=>{
console.log("I AM DONE");
});
OR
await doit();
console.log('I AM DONE');
You get the idea...
But what is really happening is, the function gets called, then 'I AM DONE' and then the results of all the sql calls.
Can someone help me get 'I AM DONE' at the bottom? Still getting used to the async/await and promises.
Thanks
After quite a bit of time trying to get this to work synchronously from the caller I gave up and re-wrote the method to use the regular query (not streaming) and implemented my own paging/throttling as to control memory usage. It works great now!
I am using a connection pool to allow for sub queries and other processes to occur async within a batch of results.
I will post the updated code.
Somehow I believe you have jumbled it all up a bit.
Use this
exports.doit = async ()=>
{
const request = new sql.Request(conn)
let records = await request.query('select * from dbo.user')
records.forEach(async r=>{
try{
// do something
const inner = new sql.Request(conn)
let recordInner = await request.query(innerQuery)
recordInner.forEach(async r=>{//do some stuff})
inner.close()
}
catch(err){
//do something with the error
}
records.close()
})
}
The execution:
async execute(){
const result = await doit()
return result
}
execute()
Though I have no idea why you are using two connections at all . Just try writing a more defined query using JOIN OR WHERE subquery. You can achieve all this in a single query instead of a using nested connection. SQL though a bit old, it really is quite powerful.
select * from dbo.order WHERE userId IN (SELECT userId FROM dbo.user)
Makes more sense to me. But, whatever floats your boat.
More on sub-queries: https://www.dofactory.com/sql/subquery

limiting number of parallel request to cassandra db in nodejs

I currently parsing a file and getting its data in order tu push them in my db. To do that I made an array of query and I execute them through a loop.
The problem is that I'm limited to 2048 parallel requests.
This is the code I made:
index.js=>
const ImportClient = require("./scripts/import_client_leasing")
const InsertDb = require("./scripts/insertDb")
const cassandra = require('cassandra-driver');
const databaseConfig = require('./config/database.json');
const authProvider = new cassandra.auth.PlainTextAuthProvider(databaseConfig.cassandra.username, databaseConfig.cassandra.password);
const db = new cassandra.Client({
contactPoints: databaseConfig.cassandra.contactPoints,
authProvider: authProvider
});
ImportClient.clientLeasingImport().then(queries => { // this function parse the data and return an array of query
return InsertDb.Clients(db, queries); //inserting in the database returns something when all the promises are done
}).then(result => {
return db.shutdown(function (err, result) {});
}).then(result => {
console.log(result);
}).catch(error => {
console.log(error)
});
insertDb.js =>
module.exports = {
Clients: function (db, queries) {
DB = db;
return insertClients(queries);
}
}
function insertClients(queries) {
return new Promise((resolve, reject) => {
let promisesArray = [];
for (let i = 0; i < queries.length; i++) {
promisesArray.push(new Promise(function (resolve, reject) {
DB.execute(queries[i], function (err, result) {
if (err) {
reject(err)
} else {
resolve("success");
}
});
}));
}
Promise.all(promisesArray).then((result) => {
resolve("success");
}).catch((error) => {
resolve("error");
});
});
}
I tried multiple things, like adding an await function thats set a timout in my for loop every x seconds (but it doesn't work because i'm already in a promise), i also tried with p-queue and p-limit but it doesn't seems to work either.
I'm kinda stuck here, I'm think I'm missing something trivial but I don't really get what.
Thanks
When submitting several requests in parallel (execute() function uses asynchronous execution), you end up queueing at one of the different levels: on the driver side, on the network stack or on the server side. Excessive queueing affects the total time it takes each operation to complete. You should limit the amount of simultaneous requests at any time, also known as concurrency level, to get high throughput and low latency.
When thinking about implementing it in your code, you should consider launching a fixed amount of asynchronous executions, using your concurrency level as a cap and only adding new operations once executions within that cap completed.
Here is an example on how to limit the amount of concurrent executions when processing items in a loop: https://github.com/datastax/nodejs-driver/blob/master/examples/concurrent-executions/execute-in-loop.js
In a nutshell:
// Launch in parallel n async operations (n being the concurrency level)
for (let i = 0; i < concurrencyLevel; i++) {
promises[i] = executeOneAtATime();
}
// ...
async function executeOneAtATime() {
// ...
// Execute queries asynchronously in sequence
while (counter++ < totalLength) {;
await client.execute(query, params, options);
}
}
Ok, so I found a workaround to reach my goal.
I wrote in a file all my queries
const fs = require('fs')
fs.appendFileSync('my_file.cql', queries[i] + "\n");
and i then used
child_process.exec("cqls --file my_file", function(err, stdout, stderr){})"
to insert in cassandra all my queries

Axios.all, how to configure axios wait time to mitigate hung up?

My application uses an internal webservice for fetching data, i have a job which creates approx 500 requests which getsfired async to complete the fetch operation.
I make use of Axios, by creating an array of axios promises and then resolving them using using Axios.all();
It works fine until some 200 requests but post that i get socket hung up, however on the server side i see the requests are being processed.
How to configure axios to set custom time out, or is it a better idea to splice my promises array and then run them as multiple batches ?
Source code
let getAxiosPromiseArray = (urlList) => {
var axiosArrayofPromise = [];
return new Promise ( (resolve, reject) => {
try {
urlList.forEach ( (URL) => {
axiosArrayofPromise.push(axios.get(URL));
});
resolve(axiosArrayofPromise);
}
catch (err) {
reject("There is a problem getting Axios array of promises " + err);
}
})
}
async function processAxiosPromises (PromiseArray) {
try {
var results = []
results = await axios.all(PromiseArray);
return results;
}
catch(err) {
throw("There was a problem resolving promises array (Axios) " + err);
}
}
getallID().then ( (urlList) => {
return getAxiosPromiseArray(urlList);
}).then( (AxiosPromises) => {
return processAxiosPromises(AxiosPromises);
}).then ((resultData) => {
console.log(resultData);
});
Error
There was a problem resolving promises array (Axios) Error: socket hang up
First, that pair of functions getAxiosPromiseArray() and processAxiosPromises() needs fixing.
Your new Promise() construction is unnecessary. You can simply return Promise.all(arrayofPromise) (or axios.all(...) if you must) and do away with the other function.
Renaming the remaining function to something meaningful, you would end up with eg :
let getData = (urlList) => {
return Promise.all(urlList.map(URL => axios.get(URL)))
.catch(error => {
error.message = "There is a problem getting Axios array of promises " + error.message; // augment the error message ...
throw error; // ... and re-throw the errror.
});
};
And call as follows :
getallID().then(getData)
.then(resultData => {
console.log(resultData);
}).catch(error => {
console.error(error);
});
That will put you on solid ground but, on its own, is unlikely to fix a concurrency problem (if that's what it is), for which the simplest approach is to use Bluebird's Promise.map with the concurrency option.
The caller code can remain the same, just change getData(), as follows:
let getData = (urlList) => {
let concurrency = 10; // play with this value to find a reliable concurrency limit
return Promise.map(urlList, URL => axios.get(URL), {'concurrency': concurrency})
.catch(error => {
error.message = "There is a problem getting Axios array of promises " + error.message;
throw error;
});
};
// where `Promise` is Bluebird.
const axios = require('axios');
const axiosThrottle = require('axios-throttle');
//pass axios object and value of the delay between requests in ms
axiosThrottle.init(axios,200)
const options = {
method: 'GET',
};
const urlList = [
'https://jsonplaceholder.typicode.com/todos/1',
'https://jsonplaceholder.typicode.com/todos/2',
'https://jsonplaceholder.typicode.com/todos/3',
'https://jsonplaceholder.typicode.com/todos/4',
'https://jsonplaceholder.typicode.com/todos/5',
'https://jsonplaceholder.typicode.com/todos/6',
'https://jsonplaceholder.typicode.com/todos/7',
'https://jsonplaceholder.typicode.com/todos/8',
'https://jsonplaceholder.typicode.com/todos/9',
'https://jsonplaceholder.typicode.com/todos/10'
];
const promises = [];
const responseInterceptor = response => {
console.log(response.data);
return response;
};
//add interceptor to work with each response seperately when it is resolved
axios.interceptors.response.use(responseInterceptor, error => {
return Promise.reject(error);
});
for (let index = 0; index < urlList.length; index++) {
options.url = urlList[index];
promises.push(axiosThrottle.getRequestPromise(options, index));
}
//run when all promises are resolved
axios.all(promises).then(responses => {
console.log(responses.length);
});
https://github.com/arekgotfryd/axios-throttle

AWS Lambda function that executes 5000+ promises to AWS SQS is extremely unreliable

I'm writing a Node AWS Lambda function that queries around 5,000 items from my DB and sends them via messages into an AWS SQS queue.
My local environment involves me running my lambda with AWS SAM local, and emulating AWS SQS with GoAWS.
An example skeleton of my Lambda is:
async run() {
try {
const accounts = await this.getAccountsFromDB();
const results = await this.writeAccountsIntoQueue(accounts);
return 'I\'ve written: ' + results + ' messages into SQS';
} catch (e) {
console.log('Caught error running job: ');
console.log(e);
return e;
}
}
There are no performance issues with my getAccountsFromDB() function and it runs almost instantly, returning me an array of 5,000 accounts.
My writeAccountsIntoQueue function looks like:
async writeAccountsIntoQueue(accounts) {
// Extract the sqsClient and queueUrl from the class
const { sqsClient, queueUrl } = this;
try {
// Create array of functions to concurrenctly call later
let promises = accounts.map(acc => async () => await sqsClient.sendMessage({
QueueUrl: queueUrl,
MessageBody: JSON.stringify(acc),
DelaySeconds: 10,
})
);
// Invoke the functions concurrently, using helper function `eachLimit`
let writtenMessages = await eachLimit(promises, 3);
return writtenMessages;
} catch (e) {
console.log('Error writing accounts into queue');
console.log(e);
return e;
}
}
My helper, eachLimit looks like:
async function eachLimit (funcs, limit) {
let rest = funcs.slice(limit);
await Promise.all(
funcs.slice(0, limit).map(
async (func) => {
await func();
while (rest.length) {
await rest.shift()();
}
}
)
);
}
To the best of my understanding, it should be limiting concurrent executions to limit.
Additionally, I've wrapped the AWS SDK SQS client to return an object with a sendMessage function that looks like:
sendMessage(params) {
const { client } = this;
return new Promise((resolve, reject) => {
client.sendMessage(params, (err, data) => {
if (err) {
console.log('Error sending message');
console.log(err);
return reject(err);
}
return resolve(data);
});
});
}
So nothing fancy there, just Promisifying a callback.
I've got my lambda set up to timeout after 300 seconds, and the lambda always times out, and if it doesn't it ends abruptly and misses some final logging that should go on, which makes me thing it may even be erroring somewhere, silently. When I check the SQS queue I'm missing around 1,000 entries.
I can see a couple of issues in your code,
First:
let promises = accounts.map(acc => async () => await sqsClient.sendMessage({
QueueUrl: queueUrl,
MessageBody: JSON.stringify(acc),
DelaySeconds: 10,
})
);
You're abusing async / await. Always bear in mind await will wait until your promise is resolved before continuing with the next one, in this case whenever you map the array promises and call each function item it will wait for the promise wrapped by that function before continuing, which is bad. Since you're only interested in getting the promises back, you could simply do this instead:
const promises = accounts.map(acc => () => sqsClient.sendMessage({
QueueUrl: queueUrl,
MessageBody: JSON.stringify(acc),
DelaySeconds: 10,
})
);
Now, for the second part, your eachLimit implementation looks wrong and very verbose, I've refactored it with help of es6-promise-pool to handle the concurrency limit for you:
const PromisePool = require('es6-promise-pool')
function eachLimit(promiseFuncs, limit) {
const promiseProducer = function () {
while(promiseFuncs.length) {
const promiseFunc = promiseFuncs.shift();
return promiseFunc();
}
return null;
}
const pool = new PromisePool(promiseProducer, limit)
const poolPromise = pool.start();
return poolPromise;
}
Lastly, but very important, have a look at SQS Limits, SQS FIFO has up to 300 sends / sec. Since you are processing 5k items, you could probably up your concurrency limit to 5k / (300 + 50) , approx 15. The 50 could be any positive number, just to move away from the limit a bit.
Also, considering using SendMessageBatch which you could have much more throughput and reach 3k sends / sec.
EDIT
As I suggested above, using sendMessageBatch the throughput is much better, so I've refactored the code mapping your promises to support sendMessageBatch:
function chunkArray(myArray, chunk_size){
var index = 0;
var arrayLength = myArray.length;
var tempArray = [];
for (index = 0; index < arrayLength; index += chunk_size) {
myChunk = myArray.slice(index, index+chunk_size);
tempArray.push(myChunk);
}
return tempArray;
}
const groupedAccounts = chunkArray(accounts, 10);
const promiseFuncs = groupedAccounts.map(accountsGroup => {
const messages = accountsGroup.map((acc,i) => {
return {
Id: `pos_${i}`,
MessageBody: JSON.stringify(acc),
DelaySeconds: 10
}
});
return () => sqsClient.sendMessageBatch({
Entries: messages,
QueueUrl: queueUrl
})
});
Then you can call eachLimit as usual:
const result = await eachLimit(promiseFuncs, 3);
The difference now is every promise processed will send a batch of messages of size n (10 in the example above).

Resources