Avoid Timeout to read 18000 document on Firestore

Avoid Timeout to read 18000 document on Firestore - node.js

My Firestore contains 17500 documents.
It's a list of tokens, in order to send push notifications.
I stock these data in a dictionary, to be able to use them later:
users = {"fr":[token, token], "en":[token, token]....}
My code:
async function getAllUsers() {
const snapshot = await admin.firestore().collection('users').get();
var users= {};
snapshot.forEach(doc => {
const userId = doc.id;
var lang = doc.data().language
if (!(lang in users)) {
users[lang] = [];
users[lang].push(doc.data().token);
} else {
users[lang].push(doc.data().token);
}
});
return users;
}
My code doesn't work anymore. I get a timeout during the foreach loop.
Is it because I have too many documents?
Any idea?
Thanks

It's not clear from your question what exactly is timing out, but there are a couple things you should be aware of.
You certainly can get errors if you attempt to read too many documents in one query. The alternative to this is to use pagination to read the documents in smaller batches so that you don't exceed any query limits.
By default Cloud Functions assumes a 60 second timeout on any function invocations. If you need more than that, you can increase the timeout, but you can only go up to 9 minutes. After that, you have to split your work up among multiple function invocations.

Related

Ability to provide insights from Redis Bull Queue data

I have an application that makes API calls to another system, and it queues these API calls in a queue using Bull and Redis.
However, occasionally it gets bogged down with lots of API calls, or something stops working properly, and I want an easy way for users to check if the system is just "busy". (Otherwise, if they perform some action, and 10 minutes later it hasn't completed, they'll keep trying it again, and then we get a backlog of more entries (and in some cases data issues where they've issued duplicate parts, etc.)
Here's what a single "key" looks like for a successful API call in the queue:
HSET "bull:webApi:4822" "timestamp" "1639085540683"
HSET "bull:webApi:4822" "returnvalue" "{"id":"e1df8bb4-fb6c-41ad-ba62-774fe64b7882","workOrderNumber":"WO309967","status":"success"}"
HSET "bull:webApi:4822" "processedOn" "1639085623027"
HSET "bull:webApi:4822" "data" "{"id":"e1df8bb4-fb6c-41ad-ba62-774fe64b7882","token":"eyJ0eXAiOiJKV1QiL....dQVyEpXt64Fznudfg","workOrder":{"members":{"lShopFloorLoad":true,"origStartDate":"2021-12-09T00:00:00","origRequiredQty":2,"requiredQty":2,"requiredDate":"2021-12-09T00:00:00","origRequiredDate":"2021-12-09T00:00:00","statusCode":"Released","imaItemName":"Solid Pin - Black","startDate":"2021-12-09T00:00:00","reference":"HS790022053","itemId":"13840402"}},"socketId":"3b9gejTZjAXsnEITAAvB","type":"Create WO"}"
HSET "bull:webApi:4822" "delay" "0"
HSET "bull:webApi:4822" "priority" "0"
HSET "bull:webApi:4822" "name" "__default__"
HSET "bull:webApi:4822" "opts" "{"lifo":true,"attempts":1,"delay":0,"timestamp":1639085540683}"
HSET "bull:webApi:4822" "finishedOn" "1639085623934"
You can see in this case it took 83 seconds to process. (1639085540 - 1639085623)
I'd like to be able to provide summary metrics like:
Most recent API call was added to queue X seconds ago
Most recent successful API call completed X seconds ago and took XX seconds to
complete.
I'd also like to be able to provide a list of the 50 most recent API calls, formatted in a nice way and tagged with "success", "pending", or "failed".
I'm fairly new to Redis and Bull, and I'm trying to figure out how to query this data (using Redis in Node.js) and return this data as JSON to the application.
I can pull a list of keys like this:
// #route GET /status
async function status(req, res) {
const client = createClient({
url: `redis://${REDIS_SERVER}:6379`
});
try {
client.on('error', (err) => console.log('Redis Client Error', err));
await client.connect();
const value = await client.keys('*');
res.json(value)
} catch (error) {
console.log('ERROR getting status: ', error.message, new Date())
res.status(500).json({ message: error.message })
} finally {
client.quit()
}
}
Which will return ["bull:webApi:3","bull:webApi:1","bull:webApi:2"...]
But how can I pull the values associated to the respective keys?
And how can I find the key with the highest number, and then pull the details for the "last 50". In SQL, it would be like doing a ORDER BY key_number DESC LIMIT 50 - but I'm not sure how to do it in Redis.

I'm a bit late here, but if you're not set on manually digging around in Redis, I think Bull's API, in particular Queue#getJobs(), has everything you need here, and should be much easier to work with. Generally, you shouldn't have to reach into Redis to do any common tasks like this, that's what Bull is for!
If I understand you goal correctly, you should be able to do something like:
const Queue = require('bull')
async function status (req, res) {
const { listNum = 10 } = req.params
const api_queue = new Queue('webApi', `redis://${REDIS_SERVER}:6379`)
const current_timestamp_sec = new Date().getTime() / 1000 // convert to seconds
const recent_jobs = await api_queue.getJobs(null, 0, listNum)
const results = recent_jobs.map(job => {
const processed_on_sec = job.processedOn / 1000
const finished_on_sec = job.finishedOn / 1000
return {
request_data: job.data,
return_data: job.returnvalue,
processedOn: processed_on_sec,
finishedOn: finished_on_sec,
duration: finished_on_sec - processed_on_sec,
elapsedSinceStart: current_timestamp_sec - processed_on_sec,
elapsedSinceFinished: current_timestamp_sec - finished_on_sec
}
})
res.json(results)
}
That will get you the most recent numList* jobs in your queue. I haven't tested this full code, and I'll leave the error handling and adding of your custom fields to the job data up to you, but the core of it is solid and I think that should meet your needs without ever having to think about how Bull stores things in Redis.
I also included a suggestion on how to deal with timestamps a bit more nicely, you don't need to do string processing to convert milliseconds to seconds. If you need them to be integers you can wrap them in Math.floor().
* at least that many, anyway - see the second note below
A couple notes:
The first argument of getJobs() is a list of statuses, so if you want to look at just completed jobs, you can pass ['completed'], or completed and active, do ['completed', 'active'], etc. If no list is provided (null) it defaults to all statuses.
As mentioned in the reference I linked, the limit here is per state - so you'll likely get more than listNum jobs back. It doesn't seem like that should be a problem for your use case, but if it is, you can sort the list returned (probably by job id) and just return the first listNum - you're guaranteed to get at least that many (assuming there are that many jobs in your queue), and won't get more than 6*listNum (since there are 6 states).
Folks new to Bull can get nervous about instantiating a Queue object to do stuff like this - but don't be! By itself a Queue instance doesn't do anything, it's just an interface to the given queue. It won't start processing jobs until you call process() to add a processor. This is, incidentally, also how you'd add jobs from a separate process than you run your queues in, but of course nothing will be added unless you call add().

So I've figured out how to pull the data I need. I'm not saying it's a good method, and I'm open to suggestions; but it seems to work to provide a filtered JSON return with the needed data, without changing how the queue functions.
Here's what it looks like:
// #route GET /status/:listNum
async function status(req, res) {
const { listNum = 10} = req.params
const client = createClient({
url: `redis://${REDIS_SERVER}:6379`
});
try {
client.on('error', (err) => console.log('Redis Client Error', err));
await client.connect();
// Find size of queue database
const total_keys = await client.sendCommand(['DBSIZE']);
const upper_value = total_keys;
const lower_value = total_keys - listNum;
// Generate array
const range = (start, stop) => Array.from({ length: (start - stop) + 1}, (_, i) => start - (i));
var queue_ids = range(upper_value, lower_value)
queue_ids = queue_ids.filter(function(x){ return x > 0 }); // Filer out anything less than zero
// Current timestamp in seconds
const current_timestamp = parseInt(String(new Date().getTime()).slice(0, -3)); // remove microseconds ("now")
var response = []; // Initialize array
for(id of queue_ids){ // Loop through queries
// Query value
var value = await client.HGETALL('bull:webApi:'+id);
if(Object.keys(value).length !== 0){ // if returned a value
// Grab most of the request (exclude the token & socketId to save space, not used)
var request_data = JSON.parse(value.data)
request_data.token = '';
request_data.socketId = '';
// Grab & calculate desired times
const processedOn = value.processedOn.slice(0, -3); // remove microseconds ("start")
const finishedOn = value.finishedOn.slice(0, -3); // remove microseconds ("done")
const duration = finishedOn - processedOn; // (seconds)
const elapsedSinceStart = current_timestamp - processedOn;
const elapsedSinceFinished = current_timestamp - finishedOn;
// Grab the returnValue
const return_data = value.returnValue;
// ignoring queue keys of: opts, priority, delay, name, timestamp
const object_data = {request_data: request_data, processedOn: processedOn, finishedOn: finishedOn, return_data: return_data, duration: duration, elapsedSinceStart: elapsedSinceStart, elapsedSinceFinished: elapsedSinceFinished }
response.push(object_data);
}
}
res.json(response);
} catch (error) {
console.log('ERROR getting status: ', error.message, new Date());
res.status(500).json({ message: error.message });
} finally {
client.quit();
}
}
It's looping the Redis query, so I wouldn't want to use this for hundreds of keys, but for 10 or even 50 I'm thinking it should work.
For now I've resorted to getting the total number of keys and working backwards:
await client.sendCommand(['DBSIZE']);
In my case it will return a total number slightly higher than the highest key id (~ a handful of status keys), but at least gets close, and then I just filter out any non-responses.
I've looked at ZRANGE a bit, but I can't figure out how to get it to give me the last id. When I have a Redis database (Bull Queue) like this:
If there's a simple Redis command I can run that will return "3", I'd probably use that instead. (since bull:webApi:3 has the highest number)
(In actual use case, this might be 9555 or some high number; I just want to get the highest numbered key that exists.)
For now I'll try using the method I've come up with above.

Using firebase cloud function to copy all documents from a master collection in Firestore to new sub-collection

I have a master collection in firestore with a couple hundred documents (which will grow to a few thousand in a couple of months).
I have a use case, where every time a new user document is created in /users/ collection, I want all the documents from the master to be copied over to /users/{userId}/.
To achieve this, I have created a firebase cloud function as below:
// setup for new user
exports.setupCollectionForUser = functions.firestore
.document('users/{userId}')
.onCreate((snap, context) => {
const userId = context.params.userId;
db.collection('master').get().then(snapshot => {
if (snapshot.empty) {
console.log('no docs found');
return;
}
snapshot.forEach(function(doc) {
return db.collection('users').doc(userId).collection('slave').doc(doc.get('uid')).set(doc.data());
});
});
});
This works, the only problem is, it takes forever (~3-5 mins) for only about 200 documents. This has been such a bummer because a lot depends on how fast these documents get copied over. I was hoping this to be not more than a few seconds at max. Also, the documents show up altogether and not as they are written, or at least they seem that way.
Am I doing anything wrong? Why should it take so long?
Is there a way I can break this operation into multiple reads and writes so that I can guarantee a minimum documents in a few seconds and not wait until all of them are copied over?
Please advise.

If I am not mistaking, by correctly managing the parallel writes with Promise.all() and returning the Promises chain it should normally improve the speed.
Try to adapt your code as follows:
exports.setupCollectionForUser = functions.firestore
.document('users/{userId}')
.onCreate((snap, context) => {
const userId = context.params.userId;
return db.collection('master').get().then(snapshot => {
if (snapshot.empty) {
console.log('no docs found');
return null;
} else {
const promises = [];
const slaveRef = db.collection('users').doc(userId).collection('slave');
snapshot.forEach(doc => {
promises.push(slaveRef.doc(doc.get('uid')).set(doc.data()))
});
return Promise.all(promises);
}
});
});
I would suggest you watch the 3 videos about "JavaScript Promises" from the Firebase video series which explain why it is key to return a Promise or a value in a background triggered Cloud Function.
Note also, that if you are sure you have less than 500 documents to save in the slave collection, you could use a batched write. (You could use it for more than 500 docs but then you would have to manage different batches of batched write...)

Get all messages from AWS SQS in NodeJS

I have the following function that gets a message from aws SQS, the problem is I get one at a time and I wish to get all of them, because I need to check the ID for each message:
function getSQSMessages() {
const params = {
QueueUrl: 'some url',
};
sqs.receiveMessage(params, (err, data) => {
if(err) {
console.log(err, err.stack)
return(err);
}
return data.Messages;
});
};
function sendMessagesBack() {
return new Promise((resolve, reject) => {
if(Array.isArray(getSQSMessages())) {
resolve(getSQSMessages());
} else {
reject(getSQSMessages());
};
});
};
The function sendMessagesBack() is used in another async/await function.
I am not sure how to get all of the messages, as I was looking on how to get them, people mention loops but I could not figure how to implement it in my case.
I assume I have to put sqs.receiveMessage() in a loop, but then I get confused on what do I need to check and when to stop the loop so I can get the ID of each message?
If anyone has any tips, please share.
Thank you.

I suggest you to use the Promise api, and it will give you the possibility to use async/await syntax right away.
const { Messages } = await sqs.receiveMessage(params).promise();
// Messages will contain all your needed info
await sqs.sendMessage(params).promise();
In this way, you will not need to wrap the callback API with Promises.

SQS doesn't return more than 10 messages in the response. To get all the available messages, you need to call the getSQSMessages function recursively.
If you return a promise from getSQSMessages, you can do something like this.
getSQSMessages()
.then(data => {
if(!data.Messages || data.Messages.length === 0){
// no messages are available. return
}
// continue processing for each message or push the messages into array and call
//getSQSMessages function again.
});

You can never be guaranteed to get all the messages in a queue, unless after you get some of them, you delete them from the queue - thus ensuring that the next requests returns a different selection of records.
Each request will return 'upto' 10 messages, if you don't delete them, then there is a good chance that the next request for 'upto' 10 messages will return a mix of messages you have already seen, and some new ones - so you will never really know when you have seen them all.
It maybe that a queue is not the right tool to use in your use case - but since I don't know your use case, its hard to say.

I know this is a bit of a necro but I landed here last night while trying to pull some all messages from a dead letter queue in SQS. While the accepted answer, "you cannot guarantee to get all messages" from the queue is absolutely correct I did want to drop an answer for anyone that may land here as well and needs to get around the 10 message limit per request from AWS.
Dependencies
In my case I have a few dependencies already in my project that I used to make life simpler.
lodash - This is something we use in our code for help making things functional. I don't think I used it below but I'm including it since it's in the file.
cli-progress - This gives you a nice little progress bar on your CLI.
Disclaimer
The below was thrown together during troubleshooting some production errors integrating with another system. Our DLQ messages contain some identifiers that I need in order to formulate cloud watch queries for troubleshooting. Given that these are two different GUIs in AWS switching back and forth is cumbersome given that our AWS session are via a form of federation and the session only lasts for one hour max.
The script
#!/usr/bin/env node
const _ = require('lodash');
const aswSdk = require('aws-sdk');
const cliProgress = require('cli-progress');
const queueUrl = 'https://[put-your-url-here]';
const queueRegion = 'us-west-1';
const getMessages = async (sqs) => {
const resp = await sqs.receiveMessage({
QueueUrl: queueUrl,
MaxNumberOfMessages: 10,
}).promise();
return resp.Messages;
};
const main = async () => {
const sqs = new aswSdk.SQS({ region: queueRegion });
// First thing we need to do is get the current number of messages in the DLQ.
const attributes = await sqs.getQueueAttributes({
QueueUrl: queueUrl,
AttributeNames: ['All'], // Probably could thin this down but its late
}).promise();
const numberOfMessage = Number(attributes.Attributes.ApproximateNumberOfMessages);
// Next we create a in-memory cache for the messages
const allMessages = {};
let running = true;
// Honesty here: The examples we have in existing code use the multi-bar. It was about 10PM and I had 28 DLQ messages I was looking into. I didn't feel it was worth converting the multi-bar to a single-bar. Look into the docs on the github page if this is really a sticking point for you.
const progress = new cliProgress.MultiBar({
format: ' {bar} | {name} | {value}/{total}',
hideCursor: true,
clearOnComplete: true,
stopOnComplete: true
}, cliProgress.Presets.shades_grey);
const progressBar = progress.create(numberOfMessage, 0, { name: 'Messages' });
// TODO: put in a time limit to avoid an infinite loop.
// NOTE: For 28 messages I managed to get them all with this approach in about 15 seconds. When/if I cleanup this script I plan to add the time based short-circuit at that point.
while (running) {
// Fetch all the messages we can from the queue. The number of messages is not guaranteed per the AWS documentation.
let messages = await getMessages(sqs);
for (let i = 0; i < messages.length; i++) {
// Loop though the existing messages and only copy messages we have not already cached.
let message = messages[i];
let data = allMessages[message.MessageId];
if (data === undefined) {
allMessages[message.MessageId] = message;
}
}
// Update our progress bar with the current progress
const discoveredMessageCount = Object.keys(allMessages).length;
progressBar.update(discoveredMessageCount);
// Give a quick pause just to make sure we don't get rate limited or something
await new Promise((resolve) => setTimeout(resolve, 1000));
running = discoveredMessageCount !== numberOfMessage;
}
// Now that we have all the messages I printed them to console so I could copy/paste the output into LibreCalc (excel-like tool). I split on the semicolon for rows out of habit since sometimes similar scripts deal with data that has commas in it.
const keys = Object.keys(allMessages);
console.log('Message ID;ID');
for (let i = 0; i < keys.length; i++) {
const message = allMessages[keys[i]];
const decodedBody = JSON.parse(message.Body);
console.log(`${message.MessageId};${decodedBody.id}`);
}
};
main();

Firestore cloud function asynchronous execution with promise

I have orders collection and products collection in my application. The user can have multiple products in their single order. What I want to do is calculating the amount of each product reading through products collection and then perform the further action. Below is what I got as of now.
exports.myfunc = functions.firestore.document('collection/{collid}')
.onCreate(event => {
let data = event.data.data();
const products = data.products;
const prices = [];
_.each(products, (data1, index) => {
const weight = data1.weight;
const isLess = data1.isLess;
firebaseAdmin.firestore().collection('collection').doc(data1.productId).onSnapshot(data2 => {
let amount = weight === '1/2' ? data2.data().price1 : data2.data().price1 * weight;
amount += isLess ? 50 : 0;
prices.push(amount);
});
});
//Do some task after _.each with new total
});
But am not able to achieve synchronous task here, so that I can store actual amount for the product against its order and calculate total to store in document.
Could anyone please tell me how I achieve the above-said scenarios? How I can work along with promise and then callback?

You can map the products array to promises, like this:
var productPromises = products.map(product => {
return new Promise((resolve, reject) => {
firebaseOperation()...onSnapshot(resolve)
})
})
Promise.all(productPromises).then(results => {
// process all results at once
})

First, don't use onSnapshot() with Cloud Functions. That attaches a listener that stay listening indefinitely, until you remove it. That's not what you want at all, because functions can't execute indefinitely.
Instead, use get(), which returns a promise when the fetch is complete.
Also, you could consider accumulating all the documents you want to access into an array and use getAll() (with the spread operator on the array) to fetch them all.

A persistent multiset that is built up sequentially and processed at once for Node.js

In Node.js I am trying to get the following behaviour: During the runtime of my Express application I accumulate several IDs of objects that need further processing. For further processing, I need to transmit these IDs to a different service. The other service however, cannot handle a lot of requests, but rather requires batch transmission. Hence I need to accumulate a lot of individual requests to a bigger one while allowing persistence.
tl;dr — Over 15 minutes, several IDs are accumulated in my application, then after this 15 minute-window, they're all emitted at once. At the same time, the next windows is opened.
From my investigation, this is probably the abstract data type of a multiset: The items in my multiset (or bag) can have duplicates (hence the multi-), they're packaged by the time window, but have no index.
My infrastructure is already using redis, but I am not sure whether there's a way to accumulate data into one single job. Or is there? Are there any other sensible ways to achieve this kind of behavior?

I might be misunderstanding some of the subtlety of your particular situation, but here goes.
Here is a simple sketch of some code that processes a batch of 10 items at a time. The way you would do this differs slightly depending on whether the processing step is synchronous or asynchronous. You don't need anything more complicated than an array for this, since arrays have constant time push and length methods and that is the only thing you need to do. You may want to add another option to flush the batch after a given item in inserted.
Synchronous example:
var batch = [];
var batchLimit = 10;
var sendItem = function (item) {
batch.push(item);
if (item.length >= batchLimit) {
processBatchSynch(batch);
batch = [];
}
}
Asynchronous example:
// note that in this case the job of emptying the batch array
// has to be done inside the callback.
var batch = [];
var batchLimit = 10;
// your callback might look something like function(err, data) { ... }
var sendItem = function (item, cb) {
batch.push(item);
if (item.length >= batchLimit) {
processBatchAsync(batch, cb);
}
}

I came up with a npm module to solve this specific problem using a MySQL database for persistence:
persistent-bag: This is to bags like redis is to queues. A bag (or multiset) is filled over time and processed at once.
On instantiation of the object, the required table is created in the provided MySQL database if necessary.
var PersistentBag = require('persistent-bag');
var bag = new PersistentBag({
host: 'localhost',
port: '3306',
user: 'root',
password: '',
database: 'test'
});
Then items can be .add()ed during the runtime of any number of applications:
var item = {
title: 'Test item to store to bag'
};
bag.add(item, function (err, itemId) {
console.log('Item id: ' + itemId);
});
Working with the emitted, aggregated items every 15 minutes is done like in kue for redis by subscribing to .process():
bag.process(function worker(bag, done) {
// bag.data is now an array of all items
doSomething(bag.data, function () {
done();
});
});

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Avoid Timeout to read 18000 document on Firestore - node.js

Related

Ability to provide insights from Redis Bull Queue data

Using firebase cloud function to copy all documents from a master collection in Firestore to new sub-collection

Get all messages from AWS SQS in NodeJS

Firestore cloud function asynchronous execution with promise

A persistent multiset that is built up sequentially and processed at once for Node.js

Categories

Resources