Check if a document exists in mongoose (Node.js)

I have seen a number of ways of finding documents in MongoDB without a performance hit, i.e. you don't actually retrieve the document; instead you just get back a count of 1 or 0 depending on whether the document exists.
In MongoDB, one can do something like:
db.<collection>.find(...).limit(1).size()
In Mongoose, you can use callbacks or not, but in both cases you are retrieving the entries rather than just checking the count. I simply want a way to check whether a document exists in Mongoose; I don't want the document itself.
EDIT: Now fiddling with the async API, I have the following code:
for (var i = 0; i < deamons.length; i++) {
    var deamon = deamons[i]; // get the deamon from the parsed XML source
    deamon = createDeamonDocument(deamon); // create a PSDeamon type document
    PSDeamon.count({deamonId: deamon.deamonId}, function(error, count) { // check if the document exists
        if (!error) {
            if (count == 0) {
                console.log('saving ' + deamon.deamonName);
                deamon.save(); // save
            } else {
                console.log('found ' + deamon.leagueName);
            }
        }
    });
}

You should read up on JavaScript scope. In the meantime, try the following code:
for (var i = 0; i < deamons.length; i++) {
    (function(d) {
        var deamon = d;
        // create a PSDeamon type document
        PSDeamon.count({
            deamonId: deamon.deamonId
        }, function(error, count) { // check if the document exists
            if (!error) {
                if (count == 0) {
                    console.log('saving ' + deamon.deamonName);
                    // get the deamon from the parsed XML source
                    deamon = createDeamonDocument(deamon);
                    deamon.save(); // save
                } else {
                    console.log('found ' + deamon.leagueName);
                }
            }
        });
    })(deamons[i]);
}
Note: since it involves DB operations, I haven't tested it.

I found it simpler this way.
let docExists = await Model.exists({key: value});
console.log(docExists);
Otherwise, if you use it inside a function, make sure the function is async.
let docHandler = async () => {
let docExists = await Model.exists({key: value});
console.log(docExists);
};

You can use count; it doesn't retrieve the entries. It relies on MongoDB's count command, which:
Counts the number of documents in a collection.
Returns a document that contains this count as well as the command status.
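For reference, a minimal sketch of that count-based check in an async style (untested; note that in Mongoose 5 and later, countDocuments() replaces the deprecated count()):
// Sketch: count-based existence check. PSDeamon is the model from the question above.
async function documentExists(Model, filter) {
    const count = await Model.countDocuments(filter);
    return count > 0;
}
// e.g. const exists = await documentExists(PSDeamon, { deamonId: deamon.deamonId });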

Related

Correct way to organise this process in Node

I need some advice on how to structure this function, as at the moment things are not happening in the correct order due to Node being asynchronous.
This is the flow I want to achieve; I don't need help with the code itself, but with the order needed to achieve the end result, and any suggestions on how to make it efficient:
1. Node routes a GET request to my controller.
2. Controller reads a .csv file on the local system and opens a read stream using the fs module.
3. Then use the csv-parse module to convert that to an array line by line (many 100,000s of lines).
4. Start a try/catch block.
5. With the current row from the CSV, take a value and try to find it in MongoDB.
6. If found, take the ID and store the line from the CSV and this ID as a foreign ID in a separate database.
7. If not found, create an entry into the DB, take the new ID, and then do 6.
8. Print out to the terminal the row number being worked on (ideally at some point I would like to be able to send this value to the page and have it update like a progress bar as the rows are completed).
Here is a small part of the code structure that I am currently using;
const fs = require('fs');
const parse = require('csv-parse');
function addDataOne(req, id) {
const modelOneInstance = new InstanceOne({ ...code });
const resultOne = modelOneInstance.save();
return resultOne;
}
function addDataTwo(req, id) {
const modelTwoInstance = new InstanceTwo({ ...code });
const resultTwo = modelTwoInstance.save();
return resultTwo;
}
exports.add_data = (req, res) => {
const fileSys = 'public/data/';
const parsedData = [];
let i = 0;
fs.createReadStream(`${fileSys}${req.query.file}`)
.pipe(parse({}))
.on('data', (dataRow) => {
let RowObj = {
one: dataRow[0],
two: dataRow[1],
three: dataRow[2],
etc,
etc
};
try {
ModelOne.find(
{ propertyone: RowObj.one, propertytwo: RowObj.two },
'_id, foreign_id'
).exec((err, searchProp) => {
if (err) {
console.log(err);
} else {
if (searchProp.length > 1) {
console.log('too many returned from find function');
}
if (searchProp.length === 1) {
addDataOne(RowObj, searchProp[0]).then((result) => {
searchProp[0].foreign_id.push(result._id);
searchProp[0].save();
});
}
if (searchProp.length === 0) {
let resultAddProp = null;
addDataTwo(RowObj).then((result) => {
resultAddProp = result;
addDataOne(req, resultAddProp._id).then((result) => {
resultAddProp.foreign_id.push(result._id);
resultAddProp.save();
});
});
}
}
});
} catch (error) {
console.log(error);
}
i++;
let iString = i.toString();
process.stdout.clearLine();
process.stdout.cursorTo(0);
process.stdout.write(iString);
})
.on('end', () => {
res.send('added');
});
};
I have tried to make the functions use async/await, but it seems to conflict with the fs.createReadStream or csv-parse functionality, probably due to my inexperience and incorrect use of the code...
I appreciate that this is a long question about the fundamentals of the code, but some tips/advice/pointers on how to get this going would be appreciated. I had it working when the data was sent one record at a time via a POST request from Postman, but I can't implement the next stage, which is to read from the CSV file containing many records.
First of all, you can combine the following two checks into one query:
if (searchProp.length === 1) {
if (searchProp.length === 0) {
Use the upsert option of MongoDB's findOneAndUpdate query to update the document, or insert it if it doesn't exist, as sketched below.
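A rough, untested sketch of what that could look like, reusing the field names from your question (adjust to your schema and the data you actually want written):
// Sketch (untested): one findOneAndUpdate with upsert replaces the two length checks.
ModelOne.findOneAndUpdate(
    { propertyone: RowObj.one, propertytwo: RowObj.two }, // match on the CSV values
    { $setOnInsert: { propertyone: RowObj.one, propertytwo: RowObj.two } },
    { upsert: true, new: true } // create the document if it does not exist, return it either way
).then((doc) => {
    // doc is either the existing document or the freshly upserted one
    return addDataOne(RowObj, doc._id);
});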
Secondly, don't do this in the main thread; use a queue mechanism instead, it will be much more efficient.
The queue I personally use is Bull:
https://github.com/OptimalBits/bull#basic-usage
It also provides the progress-reporting functionality you need (see the sketch below).
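A rough, illustrative sketch of how that could look with Bull (the queue name, payload, and the loadRows/upsertRow helpers here are made up for illustration):
// Sketch: offload the CSV import to a Bull queue and report progress,
// instead of doing all the work inside the request handler.
const Queue = require('bull');
const csvImportQueue = new Queue('csv-import'); // assumes a local Redis instance

// Producer (in the route handler): enqueue the job and return immediately.
// csvImportQueue.add({ file: req.query.file });

// Consumer: the worker processes rows and reports progress.
csvImportQueue.process(async (job) => {
    const rows = await loadRows(job.data.file); // hypothetical helper that parses the CSV
    for (let i = 0; i < rows.length; i++) {
        await upsertRow(rows[i]);               // hypothetical helper doing the DB work
        job.progress(Math.round(((i + 1) / rows.length) * 100));
    }
});

// Elsewhere (e.g. pushed to the page over websockets):
csvImportQueue.on('progress', (job, progress) => console.log(progress + '% done'));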
Also, regarding using async/await with a ReadStream, plenty of examples can be found online, such as: https://humanwhocodes.com/snippets/2019/05/nodejs-read-stream-promise/
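For instance, since Node streams (Node 10+) are async iterable, the parser output can be consumed with for await...of, so each row's database work finishes before the next row is read. A rough sketch (untested; handleRow is a hypothetical helper standing in for your find/upsert logic):
const fs = require('fs');
const parse = require('csv-parse');

async function importCsv(filePath) {
    // the parser is a Transform stream, so it can be iterated with for await...of
    const parser = fs.createReadStream(filePath).pipe(parse({}));
    let rowCount = 0;
    for await (const dataRow of parser) {
        await handleRow(dataRow); // hypothetical helper: the per-row DB work
        rowCount++;
    }
    return rowCount;
}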

Using node.js and promise to fetch paginated data

Please keep in mind that I am new to Node.js and am used to Android development.
My scenario is like this:
Run a query against the database that returns either null or a value
Call a web service with that database value; the service returns paginated info, meaning that each call gives me a parameter to pass to the next call if there is more info to fetch
After all the items are retrieved, store them in a database table
If everything goes well, then for each item received previously, I need to make another web call and store the retrieved info in another table
If fetching any of the data sets fails, all data must be reverted from the database
So far, I've tried this:
getAllData: function(){
self.getMainWebData(null)
.then(function(result){
//get secondary data for each result row and insert it into database
}
}
getMainWebData: function(nextPage){
return new Promise(function(resolve, reject) {
module.getWebData(nextPage, function(errorReturned, response, values) {
if (errorReturned) {
reject(errorReturned);
}
nextPage = response.nextPageValue;
resolve(values);
})
}).then(function(result) {
//here I need to insert the returned values in database
//there's a new page, so fetch the next set of data
if (nextPage) {
//call again getMainWebData?
self.getMainWebData(nextPage)
}
})
There are a few things missing: from what I've tested, getAllData.then fires only once, for the first set of items and not for the others, so clearly my handling of the returned data is not right.
LATER EDIT: I've edited the scenario. Given some more research, my feeling is that I could use a chain of .then() calls to perform the operations in sequence.
Yes, it is happening because you are resolving the promise on the first call itself. You should put resolve(value) inside an if statement that checks whether more data still needs to be fetched. You will also need to restructure the logic, since Node is asynchronous; the above code will not work unless you change it.
Solution 1:
You can append each paginated response to a variable outside the context of the calls you are making, and later use that value once you are done with all the responses.
getAllData: function() {
    self.getMainWebData(null)
        .then(function(result) {
            // make your database transaction if result is not an error
        });
}
function getList(nextPage, result, callback) {
    module.getWebData(nextPage, function(errorReturned, response, values) {
        if (errorReturned)
            return callback(errorReturned);
        result.push(values);
        nextPage = response.nextPageValue;
        if (nextPage)
            getList(nextPage, result, callback);
        else
            callback(null, result);
    });
}
getMainWebData: function(nextPage) {
    return new Promise(function(resolve, reject) {
        var result = [];
        getList(nextPage, result, function(err, results) {
            if (err) {
                reject(err);
            } else {
                // Here all the items are retrieved, you can store them in a database table
                // for each item received, make your web call and store it into another variable or result set
                // suggestion is to make the database transaction only after you have retrieved all your data
                // otherwise it will require a database rollback, which will depend on the database you are using
                // after all this is done, resolve the promise with the resulting value
                resolve(results);
            }
        });
    });
}
I have not tested it, but something like this should work. If the problem persists, let me know in the comments.
Solution 2:
You can remove the promises and do the same thing with callbacks, as they are easier to follow and will make sense to programmers who are familiar with structured languages.
Looking at your problem, I have created code that loops through promises and only proceeds if there is more data to be fetched; the stored data is still available in an array.
I hope this helps. Don't forget to mark the answer if it helps.
let fetchData = (offset = 0, limit = 10) => {
    let addresses = [...Array(100).keys()];
    return Promise.resolve(addresses.slice(offset, offset + limit));
};

// o => offset & l => limit
let o = 0, l = 10;
let results = [];

let process = p => {
    if (!p) return p;
    return p.then(data => {
        // Process the data here;
        console.log(data);
        // increment the pagination
        o += l;
        results = results.concat(data);
        // while a full page came back (data.length equals the limit) fetch the next page,
        // otherwise return the collected result
        return (data.length == l) ? process(fetchData(o, l)) : results;
    });
};

process(fetchData(o, l))
    .then(data => {
        // All the fetched data will be here
    }).catch(err => {
        // Handle Error here.
        // All the data retrieved so far will be available in the "results" array
    });
If you want to do this more often, I have also created a gist for reference.
If you don't want to use any global variable and want to do it in a very functional way, you can check this example; however, it requires a little more complexity.
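For completeness, a rough sketch (untested) of such a functional variant, where the accumulated pages are threaded through the recursion instead of being stored in a shared results variable:
// Sketch: pass the accumulator along instead of mutating a global "results" array.
let processAll = (offset, limit, acc = []) =>
    fetchData(offset, limit).then(data => {
        const collected = acc.concat(data);
        // keep paging while a full page came back, otherwise we are done
        return data.length === limit
            ? processAll(offset + limit, limit, collected)
            : collected;
    });

processAll(0, 10)
    .then(allData => {
        // every fetched page is available here
    })
    .catch(err => {
        // handle the error
    });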

Azure documentdb bulk insert using stored procedure

Hi, I am using 16 collections to insert around 3-4 million JSON objects, ranging from 5-10 KB per object. I am using a stored procedure to insert these documents. I have 22 Capacity Units.
function bulkImport(docs) {
var collection = getContext().getCollection();
var collectionLink = collection.getSelfLink();
// The count of imported docs, also used as current doc index.
var count = 0;
// Validate input.
if (!docs) throw new Error("The array is undefined or null.");
var docsLength = docs.length;
if (docsLength == 0) {
getContext().getResponse().setBody(0);
}
// Call the CRUD API to create a document.
tryCreateOrUpdate(docs[count], callback);
// Note that there are 2 exit conditions:
// 1) The createDocument request was not accepted.
// In this case the callback will not be called, we just call setBody and we are done.
// 2) The callback was called docs.length times.
// In this case all documents were created and we don't need to call tryCreate anymore. Just call setBody and we are done.
function tryCreateOrUpdate(doc, callback) {
var isAccepted = true;
var isFound = collection.queryDocuments(collectionLink, 'SELECT * FROM root r WHERE r.id = "' + doc.id + '"', function (err, feed, options) {
if (err) throw err;
if (!feed || !feed.length) {
isAccepted = collection.createDocument(collectionLink, doc, callback);
}
else {
// The metadata document.
var existingDoc = feed[0];
isAccepted = collection.replaceDocument(existingDoc._self, doc, callback);
}
});
// If the request was accepted, callback will be called.
// Otherwise report current count back to the client,
// which will call the script again with remaining set of docs.
// This condition will happen when this stored procedure has been running too long
// and is about to get cancelled by the server. This will allow the calling client
// to resume this batch from the point we got to before isAccepted was set to false
if (!isFound && !isAccepted) getContext().getResponse().setBody(count);
}
// This is called when collection.createDocument is done and the document has been persisted.
function callback(err, doc, options) {
if (err) throw err;
// One more document has been inserted, increment the count.
count++;
if (count >= docsLength) {
// If we have created all documents, we are done. Just set the response.
getContext().getResponse().setBody(count);
} else {
// Create next document.
tryCreateOrUpdate(docs[count], callback);
}
    }
}
My C# code looks like this:
public async Task<int> Add(List<JobDTO> entities)
{
int currentCount = 0;
int documentCount = entities.Count;
while(currentCount < documentCount)
{
string argsJson = JsonConvert.SerializeObject(entities.Skip(currentCount).ToArray());
var args = new dynamic[] { JsonConvert.DeserializeObject<dynamic[]>(argsJson) };
// 6. execute the batch.
StoredProcedureResponse<int> scriptResult = await DocumentDBRepository.Client.ExecuteStoredProcedureAsync<int>(sproc.SelfLink, args);
// 7. Prepare for next batch.
int currentlyInserted = scriptResult.Response;
currentCount += currentlyInserted;
}
return currentCount;
}
The problem I am facing is that out of the 400k documents I try to insert, at times documents get missed without any error being raised.
The application is a worker role deployed on the cloud.
If I increase the number of threads or instances inserting into DocumentDB, the number of missed documents is much higher.
How do I figure out what the problem is? Thanks in advance.
I found that when trying this code I would get an error at docs.length which stated that length was undefined.
function bulkImport(docs) {
var collection = getContext().getCollection();
var collectionLink = collection.getSelfLink();
// The count of imported docs, also used as current doc index.
var count = 0;
// Validate input.
if (!docs) throw new Error("The array is undefined or null.");
var docsLength = docs.length; // length is undefined
}
After many tests (could not find anything in Azure documentation) I realized that I could not pass an array as was suggested. The parameter had to be an object. I had to modify the batch code like this in order for it to run.
I also found I could not simply try and pass an array of documents in the DocumentDB script explorer (Input box) either, even though the placeholder help text says you can.
This code worked for me:
// pseudo object for reference only
docObject = {
"items": [{doc}, {doc}, {doc}]
}
function bulkImport(docObject) {
var context = getContext();
var collection = context.getCollection();
var collectionLink = collection.getSelfLink();
var count = 0;
// Check input
if (!docObject.items || !docObject.items.length) throw new Error("invalid document input parameter or undefined.");
var docs = docObject.items;
var docsLength = docs.length;
if (docsLength == 0) {
context.getResponse().setBody(0);
}
// Call the funct to create a document.
tryCreateOrUpdate(docs[count], callback);
// Obviously I have truncated this function. The above code should help you understand what has to change.
}
Hopefully Azure documentation will catch up or become easier to find if I missed it.
I'll also be placing a bug report for the Script Explorer in hopes that the Azurites will update.
It's important to note that stored procedures have bounded execution, in which all operations must complete within the server-specified request timeout duration. If an operation does not complete within that time limit, the transaction is automatically rolled back. In order to simplify development around these time limits, all CRUD (Create, Read, Update, and Delete) operations return a Boolean value that represents whether the operation will complete. This Boolean value can be used as a signal to wrap up execution and to implement a continuation-based model to resume execution (this is illustrated in the code samples below).
The bulk-insert stored procedure provided above implements the continuation model by returning the number of documents successfully created. This is noted in the stored procedure's comments:
// If the request was accepted, callback will be called.
// Otherwise report current count back to the client,
// which will call the script again with remaining set of docs.
// This condition will happen when this stored procedure has been running too long
// and is about to get cancelled by the server. This will allow the calling client
// to resume this batch from the point we got to before isAccepted was set to false
if (!isFound && !isAccepted) getContext().getResponse().setBody(count);
If the output document count is less than the input document count, you will need to re-run the stored procedure with the remaining set of documents.
Since May 2018 there is a new Batch SDK for Cosmos DB. There is a GitHub repo to get you started.
I have been able to import 100,000 records in 9 seconds, and using Azure Batch to fan out the inserts, I have done 19 million records in 1m15s. This was on a 1.66 million RU/s collection, which you can obviously scale down after the import.

Access and modify extern variable in MongoDB request

I have a problem in a Node.js app with MongoDB: I'm building a forum, and for each topic I want a button to display all of its sub topics.
So I need to get everything in one request:
One array with the main topics
Another map-style array where ['topic1'] contains the sub topics
Without the mapping (not the actual problem) I have this:
Post.find({'path': path})
.exec(function (err, posts){
if(err)
console.log("Get post list:" + err);
else
{
var sub_posts = new Array; // Second array with sub topics
for (var i = 0; posts[i]; i++) //Here is the loop for each topic
{
var tmp_path = ... // Path and next request works
Post.find({'path': tmp_path}) // Here is the second request
.exec(function(err, bis_posts) {
if (err) console.log('Error loading subforum');
else sub_posts.push(bis_posts); // Affectation fail !!!
})
}
res.render(... 'post_list': posts, 'sub_posts': sub_posts); // Send result
}
})}
So I get that it's a scope problem and I should use a callback, but with the loop I can't work out how to solve it.
Sorry for my English, and thanks for your answers!
I have no idea what you mean by "affectation fail", but it looks like you're calling res.render too early: callbacks are invoked asynchronously after your current context finishes executing, so when you call res.render(...) after your for loop has finished, your Post.find(...)... operations still haven't finished and their callbacks haven't been invoked, so sub_posts will still be empty.
My node.js and Mongo are rusty, so perhaps this isn't the canonical way to do it, but I'd add a counter to track the state of the pending requests and only call res.render when all subposts have been fetched:
var sub_posts = new Array;
var pending = 0;
for (var i = 0; posts[i]; i++)
{
var tmp_path = ...
Post.find({'path': tmp_path})
.exec(function(err, bis_posts) {
if (err) console.log('Error loading subforum');
else sub_posts.push(bis_posts);
pending -= 1;
if (!pending) {
// all pending subpost lookups finished, render the response:
res.render(... 'post_list': posts, 'sub_posts': sub_posts);
}
});
pending += 1;
}
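As a side note, if your Mongoose version returns promises from exec(), the same idea can be expressed with Promise.all. A rough sketch (untested; buildSubPath is a hypothetical helper standing in for your path logic, and 'forum' is a placeholder view name):
var lookups = posts.map(function (post) {
    var tmp_path = buildSubPath(post); // hypothetical helper: same path logic as above
    return Post.find({'path': tmp_path}).exec();
});

Promise.all(lookups)
    .then(function (sub_posts) {
        // sub_posts[i] holds the sub topics of posts[i], in the same order
        res.render('forum', { post_list: posts, sub_posts: sub_posts });
    })
    .catch(function (err) {
        console.log('Error loading subforum: ' + err);
    });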

Multiple queries inside mongodb query

I'm having an issue when trying to query based on the result of another query in MongoDB.
I'm trying to make an initial query and then do another query for each of the results of the first one. The reason I'm doing it like this is that I have two different collections and I need to join some data from one collection with data from the other. In the SQL world I could easily do this with a JOIN, but as I'm using MongoDB here I can't really use JOINs, so I guessed that doing a for loop inside the first query's callback function would be the way to go.
Here's the code I'm using...
var resultSet = [];
db.get('busstopcollection').find({id_bus_stop: parseInt(req.body.busstopid)}, function(e, docs){
if(e || docs.length === 0) {
console.log("Sorry, wrong id.");
return e;
}
for(var m=0; m<docs.length; m++){
var auxRes = {};
auxRes.id_bus = docs[m].id_bus;
auxRes.id_bus_stop = docs[m].id_bus_stop;
auxRes.coord_x = docs[m].coord_x;
auxRes.coord_y = docs[m].coord_y;
auxRes.id_bus_variation = docs[m].id_bus_variation;
db.get('buscollection').find({id_bus: parseInt(docs[m].id_bus)}, function(e, busDocs){
auxRes.s_origin_description = busDocs[0].s_origin_description;
auxRes.s_destination_description = busDocs[0].id_destination_description;
resultSet.push(auxRes);
});
res.send(JSON.stringify(resultSet));
}
});
I need to res.send the resultSet array after all the values have been added.
I've tried some other ways of doing this, but the thing is that when the res.send line is reached the second query hasn't finished at all. I also tried doing that inside the inner query's callback, but then I need to check whether it's the last iteration of the for loop, and checking the value of m won't do it, as by then it is always equal to docs.length.
As far as I know there's no such thing as a synchronous query in mongodb, but maybe I'm wrong.
What's the right way of doing this?
EDIT
I found a way around it, but I'm sure there's got to be a better way. Here's how I'm doing it...
db.get('busstopcollection').find({id_bus_stop: parseInt(req.body.busstopid)}, function(e, docs){
if(e || docs.length === 0) {
console.log("Ha ocurrido un error, no existe esa parada");
return e;
}
var busIDs = [];
for(var m=0; m<docs.length; m++){
busIDs.push(parseInt(docs[m].id_bus));
var auxRes = {};
auxRes.id_bus = docs[m].id_bus;
auxRes.id_bus_stop = docs[m].id_bus_stop;
auxRes.coord_x = docs[m].coord_x;
auxRes.coord_y = docs[m].coord_y;
auxRes.id_bus_variation = docs[m].id_bus_variation;
resultSet.push(auxRes);
}
db.get('buscollection').find({id_bus: {$in: busIDs}}, function(e, busDocs){
for(var n = 0; n<busDocs.length; n++){
for(var k=0; k<resultSet.length; k++){
if(resultSet[k].id_bus == busDocs[n].id_bus){
resultSet[k].s_origin_description = busDocs[n].s_origin_description;
resultSet[k].s_destination_description = busDocs[n].id_destination_description;
}
}
}
res.send(JSON.stringify(resultSet));
});
});
Node.js is asynchronous, so the programmer has to code with that behavior in mind: use callbacks, promises, or a flow-control library. In your program, you have put a Mongo query inside a loop, which is a poor way to query. Instead of querying multiple times, use the $in operator. It will improve your code's performance and also solves your response-sending problem.
var resultSet = [];
db.get('busstopcollection').find({id_bus_stop: parseInt(req.body.busstopid)}, function(e, docs){
if(e || docs.length === 0) {
console.log("Sorry, wrong id.");
return e;
}
var bus_ids = [];
for(var m=0; m<docs.length; m++){
var auxRes = {};
auxRes.id_bus = docs[m].id_bus;
bus_ids.push(parseInt(docs[m].id_bus)); // collect all ids
auxRes.id_bus_stop = docs[m].id_bus_stop;
auxRes.coord_x = docs[m].coord_x;
auxRes.coord_y = docs[m].coord_y;
auxRes.id_bus_variation = docs[m].id_bus_variation;
resultSet.push(auxRes);
}
// Query at one time for all document
db.get('buscollection').find({id_bus: {$in : bus_ids}}).toArray( function(e, busDocs){
// Now find and merge in one go
busDocs.forEach(function(eachBusDoc){
for(var i=0,len = resultSet.length;i< len;i++){
if(resultSet[i].id_bus == eachBusDoc.id_bus){
resultSet[i].s_origin_description = eachBusDoc.s_origin_description;
resultSet[i].s_destination_description = eachBusDoc.id_destination_description;
}
}
});
res.send(JSON.stringify(resultSet));
});
});
Your updated solution in your question is generally fine, as using $in is an excellent way of fetching a set of results (you'll want to make sure that you've indexed the id_bus property; see the example below).
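For instance, something along these lines in the mongo shell (or the equivalent createIndex call from your driver):
// Index id_bus so the $in lookup does not have to scan the whole collection.
db.buscollection.createIndex({ id_bus: 1 });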
Here are a few tweaks (with a bit of cleanup and optimization):
db.get('busstopcollection')
.find({id_bus_stop: parseInt(req.body.busstopid)}).toArray(function(e, docs){
var auxById = {}; // store a dictionary of all the results for later
if(e || docs === null || docs.length === 0) {
console.log("Ha ocurrido un error, no existe esa parada");
return e;
}
var busIDs = [];
docs.forEach(function(doc) {
busIDs.push(parseInt(doc.id_bus));
// consider just using the doc directly rather than copying each property
// especially if you're not manipulating any of the data as it passes
var auxRes = {
id_bus : doc.id_bus,
id_bus_stop : doc.id_bus_stop,
coord_x : doc.coord_x,
coord_y : doc.coord_y,
id_bus_variation : doc.id_bus_variation
};
// you could just use each doc directly ...
// var auxRes = doc; ??
// ** importantly, store off the id_bus for each one so you can
// ** avoid a costly loop trying to match an id below.
auxById[doc.id_bus] = auxRes;
resultSet.push(auxRes);
});
// might want to consider using a cursor ... here's an example
db.get('buscollection')
.find({id_bus: {$in: busIDs}}).each(function(e, busDoc){
// the last item in the cursor will be null
if (busDoc === null) {
res.send(JSON.stringify(resultSet));
return;
}
    var aux = auxById[busDoc.id_bus]; // named "aux" so it doesn't shadow the outer "res" (the Express response)
    if (aux) { // did we find it in our dictionary of results?
        // yes, we did == copy the necessary field data
        aux.s_origin_description = busDoc.s_origin_description;
        aux.s_destination_description = busDoc.id_destination_description;
    }
});
});
