Using batch to recursively update documents only works on small collection - node.js

I have a collection of teams containing around 80 000 documents. Every Monday I would like to reset the scores of every team using firebase cloud functions. This is my function:
exports.resetOrgScore = functions.runWith(runtimeOpts).pubsub.schedule("every monday 00:00").timeZone("Europe/Oslo").onRun(async (context) => {
let batch = admin.firestore().batch();
let count = 0;
let overallCount = 0;
const orgDocs = await admin.firestore().collection("teams").get();
orgDocs.forEach(async(doc) => {
batch.update(doc.ref, {score:0.0});
if (++count >= 500 || ++overallCount >= orgDocs.docs.length) {
await batch.commit();
batch = admin.firestore().batch();
count = 0;
}
});
});
I tried running the function in a smaller collection of 10 documents and it's working fine, but when running the function in the "teams" collection it returns "Cannot modify a WriteBatch that has been committed". I tried returning the promise like this(code below) but that doesn't fix the problem. Thanks in advance :)
return await batch.commit().then(function () {
batch = admin.firestore().batch();
count = 0;
return null;
});

There are three problems in your code:
You use async/await with forEach() which is not recommended: The problem is that the callback passed to forEach() is not being awaited, see more explanations here or here.
As detailed in the error you "Cannot modify a WriteBatch that has been committed". With await batch.commit(); batch = admin.firestore().batch(); it's exactly what you are doing.
As important, you don't return the promise returned by the asynchronous methods. See here for more details.
You'll find in the doc (see Node.js tab) a code which allows to delete, by recursively using a batch, all the docs of a collection. It's easy to adapt it to update the docs, as follows. Note that we use a dateUpdated flag to select the docs for each new batch: with the original code, the docs were deleted so no need for a flag...
const runtimeOpts = {
timeoutSeconds: 540,
memory: '1GB',
};
exports.resetOrgScore = functions
.runWith(runtimeOpts)
.pubsub
.schedule("every monday 00:00")
.timeZone("Europe/Oslo")
.onRun((context) => {
return new Promise((resolve, reject) => {
deleteQueryBatch(resolve).catch(reject);
});
});
async function deleteQueryBatch(resolve) {
const db = admin.firestore();
const snapshot = await db
.collection('teams')
.where('dateUpdated', '==', "20210302")
.orderBy('__name__')
.limit(499)
.get();
const batchSize = snapshot.size;
if (batchSize === 0) {
// When there are no documents left, we are done
resolve();
return;
}
// Delete documents in a batch
const batch = db.batch();
snapshot.docs.forEach((doc) => {
batch.update(doc.ref, { score:0.0, dateUpdated: "20210303" });
});
await batch.commit();
// Recurse on the next process tick, to avoid
// exploding the stack.
process.nextTick(() => {
deleteQueryBatch(resolve);
});
}
Note that the above Cloud Function is configured with the maximum value for the time out, i.e. 9 minutes.
If it appears that all your docs cannot be updated within 9 minutes, you will need to find another approach, for example using the Admin SDK from one of your server, or cutting the work into pieces and run the CF several times.

Related

Correct way to organise this process in Node

I need some advice on how to structure this function as at the moment it is not happening in the correct order due to node being asynchronous.
This is the flow I want to achieve; I don't need help with the code itself but with the order to achieve the end results and any suggestions on how to make it efficient
Node routes a GET request to my controller.
Controller reads a .csv file on local system and opens a read stream using fs module
Then use csv-parse module to convert that to an array line by line (many 100,000's of lines)
Start a try/catch block
With the current row from the csv, take a value and try to find it in a MongoDB
If found, take the ID and store the line from the CSV and this id as a foreign ID in a separate database
If not found, create an entry into the DB and take the new ID and then do 6.
Print out to terminal the row number being worked on (ideally at some point I would like to be able to send this value to the page and have it update like a progress bar as the rows are completed)
Here is a small part of the code structure that I am currently using;
const fs = require('fs');
const parse = require('csv-parse');
function addDataOne(req, id) {
const modelOneInstance = new InstanceOne({ ...code });
const resultOne = modelOneInstance.save();
return resultOne;
}
function addDataTwo(req, id) {
const modelTwoInstance = new InstanceTwo({ ...code });
const resultTwo = modelTwoInstance.save();
return resultTwo;
}
exports.add_data = (req, res) => {
const fileSys = 'public/data/';
const parsedData = [];
let i = 0;
fs.createReadStream(`${fileSys}${req.query.file}`)
.pipe(parse({}))
.on('data', (dataRow) => {
let RowObj = {
one: dataRow[0],
two: dataRow[1],
three: dataRow[2],
etc,
etc
};
try {
ModelOne.find(
{ propertyone: RowObj.one, propertytwo: RowObj.two },
'_id, foreign_id'
).exec((err, searchProp) => {
if (err) {
console.log(err);
} else {
if (searchProp.length > 1) {
console.log('too many returned from find function');
}
if (searchProp.length === 1) {
addDataOne(RowObj, searchProp[0]).then((result) => {
searchProp[0].foreign_id.push(result._id);
searchProp[0].save();
});
}
if (searchProp.length === 0) {
let resultAddProp = null;
addDataTwo(RowObj).then((result) => {
resultAddProp = result;
addDataOne(req, resultAddProp._id).then((result) => {
resultAddProp.foreign_id.push(result._id);
resultAddProp.save();
});
});
}
}
});
} catch (error) {
console.log(error);
}
i++;
let iString = i.toString();
process.stdout.clearLine();
process.stdout.cursorTo(0);
process.stdout.write(iString);
})
.on('end', () => {
res.send('added');
});
};
I have tried to make the functions use async/await but it seems to conflict with the fs.openReadStream or csv parse functionality, probably due to my inexperience and lack of correct use of code...
I appreciate that this is a long question about the fundamentals of the code but just some tips/advice/pointers on how to get this going would be appreciated. I had it working when the data was sent one at a time via a post request from postman but can't implement the next stage which is to read from the csv file which contains many records
First of all you can make the following checks into one query:
if (searchProp.length === 1) {
if (searchProp.length === 0) {
Use upsert option in mongodb findOneAndUpdate query to update or upsert.
Secondly don't do this in main thread. Use a queue mechanism it will be much more efficient.
Queue which I personally use is Bull Queue.
https://github.com/OptimalBits/bull#basic-usage
This also provides the functionality you need of showing progress.
Also regarding using Async Await with ReadStream, a lot of example can be found on net such as : https://humanwhocodes.com/snippets/2019/05/nodejs-read-stream-promise/

Pub/Sub Cloud Function does not Update Document in Subcollection

I am trying to update a field in my document in Firestore. The general location of the document would be "/games/{userId}/userGames/{gameId}. And in this game, there is a property called "status" which changes accordingly to the games start and end time.
As you can guess, the if the start time is bigger than the "now" timestamp and the status is "TO_BE_PLAYED", the game will begin and the status will be 1, "BEING_PLAYED". Also, if the end time is bigger than the "now" timestamp and the status is "BEING_PLAYED", the game will end, therefore the status will be 2, "PLAYED". I want to create a cloud function that is capable to do so.
However, even if the function logs output 'ok', the values are never updated. Unfortunately, I do not have that much experience in Javascript too.
THE CODE
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp();
const STATUS_PLAYED = 2;
const STATUS_BEING_PLAYED = 1;
const STATUS_TO_BE_PLAYED = 0;
exports.handleBeingPlayedGames = functions.runWith({memory: "2GB"}).pubsub.schedule('* * * * *')
.timeZone('Europe/Istanbul') // Users can choose timezone - default is America/Los_Angeles
.onRun(async () => {
// current time & stable
// was Timestamp.now();
const now = admin.firestore.Timestamp.fromDate( new Date());
const querySnapshot = await db.collection("games").get();
const promises = [];
querySnapshot.forEach( doc => {
const docRef = doc.ref;
console.log(docRef);
promises.push(docRef.collection("userGames").where("status", "==", STATUS_BEING_PLAYED).where("endtime", "<", now).get());
});
const snapshotArrays = await Promise.all(promises);
const promises1 = [];
snapshotArrays.forEach( snapArray => {
snapArray.forEach(snap => {
promises1.push(snap.ref.update({
"status": STATUS_PLAYED,
}));
});
});
return Promise.all(promises1);
});
exports.handleToBePlayedGames = functions.runWith({memory: "2GB"}).pubsub.schedule('* * * * *')
.onRun(async () => {
// current time & stable
// was Timestamp.now();
const now = admin.firestore.Timestamp.fromDate(new Date());
const querySnapshot = await db.collection("games").get();
const promises = [];
querySnapshot.forEach( async doc => {
const docData = await doc.ref.collection("userGames").where("status", "==", STATUS_TO_BE_PLAYED).where("startTime", ">", now).get();
promises.push(docData);
});
const snapshotArrays = await Promise.all(promises);
const promises1 = [];
snapshotArrays.forEach( snapArray => {
snapArray.forEach(snap => {
promises1.push(snap.ref.update({
"status": STATUS_BEING_PLAYED,
}));
});
});
return Promise.all(promises1);
});
Okay, so this answer goes to lurkers trying to solve this problem.
First I tried to solve this problem by brute force and not including much thinking and tried to acquire the value in subcollection. However, as I searched, I've found that denormalizing (flattening) data actually solves the problem a bit.
I created a new directory under /status/{gameId} with the properties
endTime, startTime, and status field and I actually did it on a single level by using promises. Sometimes denormalizing data can be your savior.
How can startTime be greater than now? Is it set by default to a date in the future?
My current assumption is that a game cannot set it's status to STATUS_BEING_PLAYED because of the inconsistency with startTime. Moreover, a game cannot have the status STATUS_PLAYED because it depends on having STATUS_BEING_PLAYED, which cannot have.
My recommendation would be to set the field startTime and endTime to null by default. If you do so you can check if a game has to be set to STATUS_BEING_PLAYED with this:
doc.ref.collection("userGames")
.where("status", "==", STATUS_TO_BE_PLAYED)
.where("startTime", "<", now)
.where("endTime", "==", null)
.get();
You could check if a game has to be on STATUS_PLAYED with this (exactly as you did):
docRef.collection("userGames")
.where("status", "==", STATUS_BEING_PLAYED)
.where("endtime", "<", now)
.get();
Now there's something that you should wonder, is this the best approach to change a game's status? You are querying the whole game library of a user every single minute as you know read operations are charged so this approach would imply meaningful charges. Maybe you should simply use update the game's status when the game is started and closed.
Also notice that the equals operation is ==, not =.

Index messed up if I upload more than one file at once

I've got the following firebase function to run once a file is uploaded to firebase storage.
It basically gets its URL and saves a reference to it in firestore. I need to save them in a way so that I can query them randomly from my client. Indexes seem to be to best fit this requirement.
for the firestore reference I need the following things:
doc ids must go from 0 to n (n beeing the index of the last
document)
have a --stats-- doc keeping track of n (gets
incremented every time a document is uploaded)
To achieve this I've written the following node.js script:
const incrementIndex = admin.firestore.FieldValue.increment(1);
export const image_from_storage_to_firestore = functions.storage
.object()
.onFinalize(async object => {
const bucket = gcs.bucket(object.bucket);
const filePath = object.name;
const splittedPath = filePath!.split("/");
// se siamo nelle immagini
// path = emotions/$emotion/photos/$photographer/file.jpeg
if (splittedPath[0] === "emotions" && splittedPath[2] === "photos") {
const emotion = splittedPath[1];
const photographer = splittedPath[3];
const file = bucket.file(filePath!);
const indexRef = admin.firestore().collection("images")
.doc("emotions").collection(emotion).doc("--stats--");
const index = await indexRef.get().then((doc) => {
if (!doc.exists) {
return 0;
} else {
return doc.data()!.index;
}
});
if (index === 0) {
await admin.firestore().collection("images")
.doc("emotions")
.collection(emotion)
.doc("--stats--")
.set({index: 0});
}
console.log("(GOT INDEX): " + index);
let imageURL;
await file
.getSignedUrl({
action: "read",
expires: "03-09-2491"
})
.then(signedUrls => {
imageURL = signedUrls[0];
});
console.log("(GOT URL): " + imageURL);
var docRef = admin.firestore()
.collection("images")
.doc("emotions")
.collection(emotion)
.doc(String(index));
console.log("uploading...");
await indexRef.update({index: incrementIndex});
await docRef.set({ imageURL: imageURL, photographer: photographer });
console.log("finished");
return true;
}
return false;
});
Getting to the problem:
It works perfectly if I upload the files one by one.
It messes up the index if I upload more than one file at once, because two concurrent uploads will read the same index value from --stats-- and one will overwrite the other.
How would you solve this problem? would you use another approach instead of the indexed one?
You should use a Transaction in which you:
read the value of the index (from "--stats--" document),
write the new index and
write the value of the imageURL in the "emotion" doc.
See also the reference docs about transactions.
This way, if the index value is changed in the "--stats--" document while the Transaction is being executed, the Cloud Function can catch the Transaction failure and generates an error which finishes it.
In parallel, you will need to enable retries for this background Cloud Function, in order it is retried if the Transaction failed in a previous run.
See this documentation item https://firebase.google.com/docs/functions/retries, including the video from Doug Stevenson which is embedded in the doc.

using while loop with async function and setTimeout. Nodejs

I'm trying to create a test, to verify that I've put an item in a dynamoDB table. In order to do so, right after I make a call that should put an Item (vehicle) in the database, I am trying to get the vehicle from the DB.
In my test I want to have a maximum number of retries (5). I want this while loop to be block the thread until my query has resolved to give a vehicle, or tried 5 times. Inside my test I have:
let count = 0
let car
while (!car || count < 5) {
setTimeout(async () => {
car = await findVehicle(greenCar.vehicleInfo)
}, 3000)
count++
}
And findVehicle is an asynchronous function that does a get from the dynamoDB table
If you want to wait on each iteration you can do this:
let count = 0;
let car;
while (!car || count < 5) {
await new Promise((resolve) =>
setTimeout(async () => {
car = await findVehicle(greenCar.vehicleInfo);
resolve();
}, 3000));
count++
}
So you are resolving the promise you are awaiting after you get your data. Also your function must be async in order to use await. Hope this helps.

Firestore cloud function asynchronous execution with promise

I have orders collection and products collection in my application. The user can have multiple products in their single order. What I want to do is calculating the amount of each product reading through products collection and then perform the further action. Below is what I got as of now.
exports.myfunc = functions.firestore.document('collection/{collid}')
.onCreate(event => {
let data = event.data.data();
const products = data.products;
const prices = [];
_.each(products, (data1, index) => {
const weight = data1.weight;
const isLess = data1.isLess;
firebaseAdmin.firestore().collection('collection').doc(data1.productId).onSnapshot(data2 => {
let amount = weight === '1/2' ? data2.data().price1 : data2.data().price1 * weight;
amount += isLess ? 50 : 0;
prices.push(amount);
});
});
//Do some task after _.each with new total
});
But am not able to achieve synchronous task here, so that I can store actual amount for the product against its order and calculate total to store in document.
Could anyone please tell me how I achieve the above-said scenarios? How I can work along with promise and then callback?
You can map the products array to promises, like this:
var productPromises = products.map(product => {
return new Promise((resolve, reject) => {
firebaseOperation()...onSnapshot(resolve)
})
})
Promise.all(productPromises).then(results => {
// process all results at once
})
First, don't use onSnapshot() with Cloud Functions. That attaches a listener that stay listening indefinitely, until you remove it. That's not what you want at all, because functions can't execute indefinitely.
Instead, use get(), which returns a promise when the fetch is complete.
Also, you could consider accumulating all the documents you want to access into an array and use getAll() (with the spread operator on the array) to fetch them all.

Resources