Mongo $min and $max, or Parallel sort - node.js

Hi, I want to get the min and max value of a field in my db.
I found this solution which queries and sorts the results:
get max value in mongoose
I could do this twice and combine it with async.parallel to write it non-blocking. But I guess two db queries may not be the best solution.
The second solution would be to use aggregate. But I don't want to group anything. I only want to use $match to filter (the filter criteria are always different and can be {}) and run the query against all documents in my collection.
http://docs.mongodb.org/manual/reference/operator/aggregation/min/
http://docs.mongodb.org/manual/reference/operator/aggregation/max/
Questions:
1) Can I run this in one query with aggregate, maybe with $project?
2) Is there another method than aggregate that works without grouping?
3) Will 1)/2) be more time efficient than the first solution with sorting?
EDIT:
Solved with the first solution, but I think there is a more efficient solution because this needs two database operations:
async.parallel {
  min: (next) ->
    ImplantModel.findOne(newFilter).sort("serialNr").exec (err, result) ->
      return next err if err?
      return next null, 0 if !result?
      next(null, result.serialNr)
  max: (next) ->
    ImplantModel.findOne(newFilter).sort("-serialNr").exec (err, result) ->
      return next err if err?
      return next null, 0 if !result?
      next(null, result.serialNr)
}, (err, results) ->
  console.log results.min, ' ', results.max
  return callback(err) if err?
  return callback null, { min: results.min, max: results.max }
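For comparison, the single-query aggregation I was asking about would presumably look something like this (an untested sketch in plain JavaScript, reusing the ImplantModel and newFilter names from above; $match with an empty filter simply passes every document):
// One aggregation round trip instead of two sorted findOne() calls.
ImplantModel.aggregate([
  { $match: newFilter },
  { $group: { _id: null, min: { $min: "$serialNr" }, max: { $max: "$serialNr" } } }
], function (err, result) {
  if (err) return callback(err);
  if (!result.length) return callback(null, { min: 0, max: 0 });
  callback(null, { min: result[0].min, max: result[0].max });
});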

Don't know what it is about this question, and I'm sure to get no real love for the response, but I just could not let it go and get to sleep without resolving it.
So the first thing to say is I think I owe the OP here $10, because my expected results are not the case.
The basic idea presented here is a comparison of:
Using parallel execution of queries to find the "maximum" ( sorted total value ) of a field and also the minimum value by the same constraint
The aggregation framework $max and $min grouping accumulators over the whole collection.
In "theory" these two options are doing exactly the same thing. And in "theory" even though parallel execution can happen "over the wire" with simultaneous requests to the server, there still should be an "overhead" inherrent in those requests and the "aggregation" function in the client to bring both results together.
The tests here run a "series" execution of creating random data of a reasonable key length, the to be "fair" in comparison the "key" data here is also indexed.
The next "fairness" stage is to "warm up" the data, by doing a sequential "fetch" on all items, to simulate loading as much of the "working set" of data into memory as the client machine is capable.
Then we run each test, in comparison and in series so as not to compete against each other for resources, for either the "parallel query" case or the "aggregation" case, to see the results with timers attached to the start and end of each execution.
Here is my testbed script, on the basic driver to keep things as lean as possible ( nodejs environment considered ):
var async = require('async'),
    mongodb = require('mongodb'),
    MongoClient = mongodb.MongoClient;

var total = 1000000;

MongoClient.connect('mongodb://localhost/bigjunk',function(err,db) {
  if (err) throw err;

  var a = 10000000000000000000000;

  db.collection('bigjunk',function(err,coll) {
    if (err) throw err;

    async.series(
      [
        // Clean data
        function(callback) {
          console.log("removing");
          coll.remove({},callback);
        },

        // Insert data
        function(callback) {
          var count = 0,
              bulk = coll.initializeUnorderedBulkOp();
          async.whilst(
            function() { return count < total },
            function(callback) {
              var randVal = Math.floor(Math.random(a)*a).toString(16);
              //console.log(randVal);
              bulk.insert({ "rand": randVal });
              count++;
              if ( count % 1000 == 0 ) {
                if ( count % 10000 == 0 ) {
                  console.log("counter: %s",count); // log 10000
                }
                bulk.execute(function(err,res) {
                  bulk = coll.initializeUnorderedBulkOp();
                  callback();
                });
              } else {
                callback();
              }
            },
            callback
          );
        },

        // index the collection
        function(callback) {
          console.log("indexing");
          coll.createIndex({ "rand": 1 },callback);
        },

        // Warm up
        function(callback) {
          console.log("warming");
          var cursor = coll.find();
          cursor.on("error",function(err) {
            callback(err);
          });
          cursor.on("data",function(data) {
            // nuthin
          });
          cursor.on("end",function() {
            callback();
          });
        },

        /*
         * *** The tests ***
         */

        // Parallel test
        function(callback) {
          console.log("parallel");
          console.log(Date.now());
          async.map(
            [1,-1],
            function(order,callback) {
              coll.findOne({},{ "sort": { "rand": order } },callback);
            },
            function(err,result) {
              console.log(Date.now());
              if (err) callback(err);
              console.log(result);
              callback();
            }
          );
        },

        function(callback) {
          console.log(Date.now());
          coll.aggregate(
            { "$group": {
              "_id": null,
              "min": { "$min": "$rand" },
              "max": { "$max": "$rand" }
            }},
            function(err,result) {
              console.log(Date.now());
              if (err) callback(err);
              console.log(result);
              callback();
            }
          );
        }
      ],
      function(err) {
        if (err) throw err;
        db.close();
      }
    );
  });
});
And the results ( compared to what I expected ) are appalling in the "aggregate" case.
For 10,000 documents:
1438964189731
1438964189737
[ { _id: 55c4d9dc57c520412399bde4, rand: '1000bf6bda089c00000' },
{ _id: 55c4d9dd57c520412399c731, rand: 'fff95e4662e6600000' } ]
1438964189741
1438964189773
[ { _id: null,
min: '1000bf6bda089c00000',
max: 'fff95e4662e6600000' } ]
Which indicates a difference of 6 ms for the parallel case, and a huge difference of 32ms for the aggregation case.
Can this get better? No:
For 100,000 documents:
1438965011402
1438965011407
[ { _id: 55c4dd036902125223a05958, rand: '10003bab87750d00000' },
{ _id: 55c4dd066902125223a0a84a, rand: 'fffe9714df72980000' } ]
1438965011411
1438965011640
[ { _id: null,
min: '10003bab87750d00000',
max: 'fffe9714df72980000' } ]
And the results still clearly show 5 ms for the parallel case, which is close to the result with 10 times less data, while the aggregation case now takes 229 ms, nearly a factor of 10 ( in line with the increased amount of data ) slower than the previous sample.
But wait, because it gets worse. Let's increase the sample to 1,000,000 entries:
1,000,000 document sample:
1438965648937
1438965648942
[ { _id: 55c4df7729cce9612303e39c, rand: '1000038ace6af800000' },
{ _id: 55c4df1029cce96123fa2195, rand: 'fffff7b34aa7300000' } ]
1438965648946
1438965651306
[ { _id: null,
min: '1000038ace6af800000',
max: 'fffff7b34aa7300000' } ]
This is actually the worst, because whilst the "parallel" case still continues to exhibit a 5 ms response time, the "aggregation" case now blows out to a whopping 2360 ms (wow, over 2 whole seconds), which can only be considered totally unacceptable as a differential from the alternate approach time. That is 500 times the execution cycle, and in computing terms that is huge.
Conclusions
Never make a bet on something unless you know a sure winner.
Aggregation "should" win here as the principles behind the results are basically the same as the "parallel excecution case" in the basic algorithm to pick the results from the keys of the index which is available.
This is a "fail" ( as my kids are fond of saying ) where the aggregation pipeline needs to be tought by someone ( my "semi-partner" is good at these things ) to go back to "algorithm school" and re-learn the basics that are being used by it's poorer cousin to producemuch faster results.
So the basic lesson here is:
We think the "aggregate" accumulators should be optimized to do this, but at present they clearly are not.
If you want the fastest way to determine min/max on a collection of data ( without any distinct keys ), then parallel query execution using the .sort() modifier is actually much faster than any alternative ( with an index ).
So for people wanting to do this over a collection of data, use a parallel query as shown here. It's much faster ( until we can teach operators to be better :> )
I should note here that all timings are relative to hardware, and it is mainly the "comparison" of timings that is valid here.
These results are from my ( ancient ) laptop
Core I7 CPU (8x cores)
Windows 7 Host ( yes could not be bothered to re-install )
8GB RAM Host
4GB Allocated VM ( 4x core allocation )
VirtualBox 4.3.28
Ubuntu 15.04
MongoDB 3.1.6 (Devel)
And the latest "stable" node versions for packages as required in the listing here.

Related

Speed issue and also memory error when querying in Mongo

I have a table that contains over 100,000 records. Server: node.js/express.js. DB: mongo
On the client, a table with a pager is implemented. 10 records are requested each time.
When there were 10,000 records, of course, everything worked faster, but now there is a problem with speed, and not only that.
My aggregation:
import { concat } from 'lodash';
...
let query = [{ $match: {} }];
query = concat(query, [{ $sort: { createdAt: -1 } }]);
query = concat(query, [
  { $skip: (pageNum - 1) * perPage }, // 0
  { $limit: perPage }                 // 10
]);
return User.aggregate(query)
  .collation({ locale: 'en', strength: 2 })
  .then((users) => ...;
2 cases:
the first fetch is very slow
when I click to the last page I get an error:
MongoError: Sort exceeded memory limit of 104857600 bytes, but did not opt in to external sorting. Aborting operation. Pass allowDiskUse:true to opt in.
Please tell me: am I building the aggregation incorrectly, or is there a problem with memory on the server as the error says, so that additional nginx settings are needed (another person is handling that), or is the problem a combination of these, or perhaps something else altogether?
Added:
I noticed that the index is not used when sorting, although it should be used?
aggregation to execute console.log =>
[
  {
    "$match": {}
  },
  {
    "$lookup": {
      ...
    }
  },
  {
    "$project": {
      ...,
      createdAt: 1,
      ...
    }
  },
  {
    "$match": {}
  },
  {
    "$sort": {
      "createdAt": -1
    }
  },
  {
    "$skip": 0
  },
  {
    "$limit": 10
  }
]
Thanks for any answers, and sorry for my English :)
It does say that you've hit the memory limit, which makes sense considering that you're trying to sort through 100,000 records. I'd try using return User.aggregate(query, { allowDiskUse: true }) //etc, and see if that helps your issue.
Whilst this isn't the documentation on the Node.js driver specifically, this link summarizes what the allowDiskUse option does (in short, it allows MongoDB to go past the 100MB memory limit and use your system storage to temporarily store some information while it performs the query).
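For instance, with the Mongoose aggregate builder used in the question, the option can also be set fluently (a sketch; exact availability depends on your Mongoose version):
// Opt in to external sorting so the $sort stage may spill to disk.
return User.aggregate(query)
  .allowDiskUse(true)
  .collation({ locale: 'en', strength: 2 })
  .then((users) => {
    // ... same handling as before
  });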

Multiple findOneAndUpdate operations are being skipped

I have a forEach loop where I am querying a document, doing some simple math calculations and then updating a document in the collection and move on to the next iteration.
The problem is, a lot of the time some of the UPDATE operations will randomly not update the document. I don't know why it is happening. Is it because of the lock?
I have tried logging things just before the update operation. The data is all correct, but when it comes to the update, it will randomly not update at all. Out of 10 iterations, let's say 8 will work correctly:
const name = "foo_game";

players.forEach(({ id, team, username }) => {
  let updatedStats = {};

  Users.findOne({ id }).then(existingPlayer => {
    if (!existingPlayer) return;

    const { stats } = existingPlayer;
    const existingStats = stats[name];
    if (!existingStats) return;

    const presentWins = existingStats.won || 0;
    const presentLosses = existingStats.lost || 0;

    updatedStats = {
      ...existingStats,
      won:
        team === winningTeam
          ? presentWins + 1
          : changeWinner
          ? presentWins - 1
          : presentWins,
      lost:
        team !== winningTeam
          ? presentLosses + 1
          : changeWinner
          ? presentLosses - 1
          : presentLosses,
    };

    // THE CALCULATIONS ARE ALL CORRECT TILL THIS POINT
    // THE UPDATE WILL RANDOMLY NOT WORK
    Users.findOneAndUpdate(
      { id, server_id: serverId },
      {
        $set: {
          username,
          stats: { ...stats, [name]: updatedStats },
        },
      },
      {
        upsert: true,
      }
    ).exec();
  });
});
Basically what you are missing here is that the asynchronous operations of both the findOne() and the findOneAndUpdate() are not guaranteed to complete before your forEach() is completed. Using forEach() is not a great choice for a loop with async operations in it, but the other main point here is that it's completely unnecessary, since MongoDB has a much better way of doing this in one request to the server.
In short, instead of "looping" you actually want to provide an array of instructions to bulkWrite():
let server_id = serverId; // Alias one of your variables or just change its name

Users.bulkWrite(
  players.map(({ id, team, username }) => (
    {
      "updateOne": {
        "filter": { id, server_id },
        "update": {
          "$set": { username },
          "$inc": {
            [`stats.${name}.won`]:
              team === winningTeam ? 1 : changeWinner ? -1 : 0,
            [`stats.${name}.lost`]:
              team !== winningTeam ? 1 : changeWinner ? -1 : 0
          }
        },
        "upsert": true
      }
    }
  ))
)
.then(() => /* whatever continuation here */ )
.catch(e => console.error(e))
.then(() => /* whatever continuation here */ )
.catch(e => console.error(e))
So rather than looping, that Array.map() produces one "updateOne" statement within the bulk operation for each array member and sends them all to the server. The other change, of course, is that you simply do not need the findOne() in order to read existing values. All you really need is the $inc operator in order to either increase or decrease the current value. Note that if nothing is currently recorded at the specified path, it will create that path with whatever value of 1/-1/0 the logic determined and handed to $inc.
Note this is how you actually should be doing things in general, as aside from avoiding unnecessary loops of async calls, the main thing here is to actually use the atomic operators like $inc that MongoDB has. Reading data state from the database in order to make changes is an anti-pattern and best avoided.

How to order by twice with MongoDB, Mongoose, and NodeJS [duplicate]

I am looking to get a random record from a huge collection (100 million records).
What is the fastest and most efficient way to do so?
The data is already there and there is no field in which I can generate a random number and obtain a random row.
Starting with the 3.2 release of MongoDB, you can get N random docs from a collection using the $sample aggregation pipeline operator:
// Get one random document from the mycoll collection.
db.mycoll.aggregate([{ $sample: { size: 1 } }])
If you want to select the random document(s) from a filtered subset of the collection, prepend a $match stage to the pipeline:
// Get one random document matching {a: 10} from the mycoll collection.
db.mycoll.aggregate([
{ $match: { a: 10 } },
{ $sample: { size: 1 } }
])
As noted in the comments, when size is greater than 1, there may be duplicates in the returned document sample.
Do a count of all records, generate a random number between 0 and the count, and then do:
db.yourCollection.find().limit(-1).skip(yourRandomNumber).next()
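In the Node.js driver the same count-and-skip idea looks roughly like this (a sketch only; coll is assumed to be a driver Collection and countDocuments() a reasonably recent driver method):
// Count the documents, skip to a random offset, take one.
const count = await coll.countDocuments();
const randomDoc = await coll.find()
  .skip(Math.floor(Math.random() * count))
  .limit(1)
  .next();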
Update for MongoDB 3.2
3.2 introduced $sample to the aggregation pipeline.
There's also a good blog post on putting it into practice.
For older versions (previous answer)
This was actually a feature request: http://jira.mongodb.org/browse/SERVER-533 but it was filed under "Won't fix."
The cookbook has a very good recipe to select a random document out of a collection: http://cookbook.mongodb.org/patterns/random-attribute/
To paraphrase the recipe, you assign random numbers to your documents:
db.docs.save( { key : 1, ..., random : Math.random() } )
Then select a random document:
rand = Math.random()
result = db.docs.findOne( { key : 2, random : { $gte : rand } } )
if ( result == null ) {
result = db.docs.findOne( { key : 2, random : { $lte : rand } } )
}
Querying with both $gte and $lte is necessary to find the document with a random number nearest rand.
And of course you'll want to index on the random field:
db.docs.ensureIndex( { key : 1, random :1 } )
If you're already querying against an index, simply drop it, append random: 1 to it, and add it again.
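For example (a sketch assuming the existing index was { key: 1 }):
// Replace the single-field index with a compound one ending in random.
db.docs.dropIndex({ key: 1 })
db.docs.ensureIndex({ key: 1, random: 1 })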
You can also use MongoDB's geospatial indexing feature to select the documents 'nearest' to a random number.
First, enable geospatial indexing on a collection:
db.docs.ensureIndex( { random_point: '2d' } )
To create a bunch of documents with random points on the X-axis:
for ( i = 0; i < 10; ++i ) {
  db.docs.insert( { key: i, random_point: [Math.random(), 0] } );
}
Then you can get a random document from the collection like this:
db.docs.findOne( { random_point : { $near : [Math.random(), 0] } } )
Or you can retrieve several documents nearest to a random point:
db.docs.find( { random_point : { $near : [Math.random(), 0] } } ).limit( 4 )
This requires only one query and no null checks, plus the code is clean, simple and flexible. You could even use the Y-axis of the geopoint to add a second randomness dimension to your query.
The following recipe is a little slower than the mongo cookbook solution (add a random key on every document), but returns more evenly distributed random documents. It's a little less-evenly distributed than the skip( random ) solution, but much faster and more fail-safe in case documents are removed.
function draw(collection, query) {
  // query: mongodb query object (optional)
  var query = query || { };
  query['random'] = { $lte: Math.random() };
  var cur = collection.find(query).sort({ random: -1 });
  if (! cur.hasNext()) {
    delete query.random;
    cur = collection.find(query).sort({ random: -1 });
  }
  var doc = cur.next();
  doc.random = Math.random();
  collection.update({ _id: doc._id }, doc);
  return doc;
}
It also requires you to add a "random" field to your documents, so don't forget to add this when you create them; you may need to initialize your collection as shown by Geoffrey:
function addRandom(collection) {
  collection.find().forEach(function (obj) {
    obj.random = Math.random();
    collection.save(obj);
  });
}
db.eval(addRandom, db.things);
Benchmark results
This method is much faster than the skip() method (of ceejayoz) and generates more uniformly random documents than the "cookbook" method reported by Michael:
For a collection with 1,000,000 elements:
This method takes less than a millisecond on my machine
the skip() method takes 180 ms on average
The cookbook method will cause large numbers of documents to never get picked because their random number does not favor them.
This method will pick all elements evenly over time.
In my benchmark it was only 30% slower than the cookbook method.
the randomness is not 100% perfect but it is very good (and it can be improved if necessary)
This recipe is not perfect - the perfect solution would be a built-in feature as others have noted.
However it should be a good compromise for many purposes.
Here is a way using the default ObjectId values for _id and a little math and logic.
// Get the "min" and "max" timestamp values from the _id in the collection and the
// diff between.
// 4-bytes from a hex string is 8 characters
var min = parseInt(db.collection.find()
.sort({ "_id": 1 }).limit(1).toArray()[0]._id.str.substr(0,8),16)*1000,
max = parseInt(db.collection.find()
.sort({ "_id": -1 })limit(1).toArray()[0]._id.str.substr(0,8),16)*1000,
diff = max - min;
// Get a random value from diff and divide/multiply be 1000 for The "_id" precision:
var random = Math.floor(Math.floor(Math.random(diff)*diff)/1000)*1000;
// Use "random" in the range and pad the hex string to a valid ObjectId
var _id = new ObjectId(((min + random)/1000).toString(16) + "0000000000000000")
// Then query for the single document:
var randomDoc = db.collection.find({ "_id": { "$gte": _id } })
.sort({ "_id": 1 }).limit(1).toArray()[0];
That's the general logic in shell representation and easily adaptable.
So in points:
Find the min and max primary key values in the collection
Generate a random number that falls between the timestamps of those documents.
Add the random number to the minimum value and find the first document that is greater than or equal to that value.
This uses "padding" from the timestamp value in "hex" to form a valid ObjectId value since that is what we are looking for. Using integers as the _id value is essentially simplier but the same basic idea in the points.
Now you can use the $sample aggregation stage.
Example:
db.users.aggregate(
[ { $sample: { size: 3 } } ]
)
See the doc.
In Python using pymongo:
import random

def get_random_doc():
    count = collection.count()
    return collection.find()[random.randrange(count)]
Using Python (pymongo), the aggregate function also works.
collection.aggregate([{'$sample': {'size': sample_size }}])
This approach is a lot faster than running a query and indexing into it at a random position (e.g. collection.find()[random_int]). This is especially the case for large collections.
It is tough if there is no data there to key off of. What is the _id field? Are they MongoDB ObjectIds? If so, you could get the highest and lowest values:
lowest = db.coll.find().sort({_id:1}).limit(1).next()._id;
highest = db.coll.find().sort({_id:-1}).limit(1).next()._id;
then if you assume the id's are uniformly distributed (but they aren't, but at least it's a start):
unsigned long long L = first_8_bytes_of(lowest)
unsigned long long H = first_8_bytes_of(highest)
V = (H - L) * random_from_0_to_1();
N = L + V;
oid = N concat random_4_bytes();
randomobj = db.coll.find({_id:{$gte:oid}}).limit(1);
You can pick a random timestamp and search for the first object that was created afterwards.
It will only scan a single document, though it doesn't necessarily give you a uniform distribution.
var randRec = function() {
  // replace with your collection
  var coll = db.collection
  // get unixtime of first and last record
  var min = coll.find().sort({_id: 1}).limit(1)[0]._id.getTimestamp() - 0;
  var max = coll.find().sort({_id: -1}).limit(1)[0]._id.getTimestamp() - 0;

  // allow to pass additional query params
  return function(query) {
    if (typeof query === 'undefined') query = {}
    var randTime = Math.round(Math.random() * (max - min)) + min;
    var hexSeconds = Math.floor(randTime / 1000).toString(16);
    var id = ObjectId(hexSeconds + "0000000000000000");
    query._id = {$gte: id}
    return coll.find(query).limit(1)
  };
}();
My solution in PHP:
/**
 * Get random docs from Mongo
 * @param $collection
 * @param $where
 * @param $fields
 * @param $limit
 * @author happy-code
 * @url happy-code.com
 */
private function _mongodb_get_random (MongoCollection $collection, $where = array(), $fields = array(), $limit = false) {

    // Total docs
    $count = $collection->find($where, $fields)->count();

    if (!$limit) {
        // Get all docs
        $limit = $count;
    }

    $data = array();
    for( $i = 0; $i < $limit; $i++ ) {

        // Skip documents
        $skip = rand(0, ($count-1) );
        if ($skip !== 0) {
            $doc = $collection->find($where, $fields)->skip($skip)->limit(1)->getNext();
        } else {
            $doc = $collection->find($where, $fields)->limit(1)->getNext();
        }

        if (is_array($doc)) {
            // Catch document
            $data[ $doc['_id']->{'$id'} ] = $doc;
            // Ignore current document when making the next iteration
            $where['_id']['$nin'][] = $doc['_id'];
        }

        // Every iteration catch document and decrease the total number of documents
        $count--;
    }

    return $data;
}
In order to get a determined number of random docs without duplicates:
first get all ids
get the number of documents
loop, getting a random index and skipping duplicates
number_of_docs = 7

db.collection('preguntas').find({}, { _id: 1 }).toArray(function(err, arr) {
  count = arr.length
  idsram = []
  rans = []
  while (number_of_docs != 0) {
    var R = Math.floor(Math.random() * count);
    if (rans.indexOf(R) > -1) {
      continue
    } else {
      rans.push(R)
      idsram.push(arr[R]._id)
      number_of_docs--
    }
  }

  // Fetch only the randomly selected ids
  db.collection('preguntas').find({ _id: { $in: idsram } }).toArray(function(err1, doc1) {
    if (err1) { console.log(err1); return; }
    res.send(doc1)
  });
});
The best way in Mongoose is to make an aggregation call with $sample.
However, Mongoose does not return Mongoose documents from an aggregation - especially not if populate() is to be applied as well.
For getting a "lean" array from the database:
/*
  Sample model should be init first
  const Sample = mongoose …
*/
const samples = await Sample.aggregate([
  { $match: {} },
  { $sample: { size: 33 } },
]).exec();
console.log(samples); //a lean Array
For getting an array of mongoose documents:
const samples = (
  await Sample.aggregate([
    { $match: {} },
    { $sample: { size: 27 } },
    { $project: { _id: 1 } },
  ]).exec()
).map(v => v._id);

const mongooseSamples = await Sample.find({ _id: { $in: samples } });
console.log(mongooseSamples); //an Array of mongoose documents
I would suggest using map/reduce, where you use the map function to only emit when a random value is above a given probability.
function mapf() {
  if (Math.random() <= probability) {
    emit(1, this);
  }
}

function reducef(key, values) {
  return { "documents": values };
}

res = db.questions.mapReduce(mapf, reducef, { "out": { "inline": 1 }, "scope": { "probability": 0.5 } });
printjson(res.results);
The reducef function above works because only one key ('1') is emitted from the map function.
The value of "probability" is defined in the "scope" when invoking mapReduce(...).
Using mapReduce like this should also be usable on a sharded db.
If you want to select exactly n of m documents from the db, you could do it like this:
function mapf() {
  if (countSubset == 0) return;
  var prob = countSubset / countTotal;
  if (Math.random() <= prob) {
    emit(1, { "documents": [this] });
    countSubset--;
  }
  countTotal--;
}

function reducef(key, values) {
  var newArray = new Array();
  for (var i = 0; i < values.length; i++) {
    newArray = newArray.concat(values[i].documents);
  }
  return { "documents": newArray };
}

res = db.questions.mapReduce(mapf, reducef, { "out": { "inline": 1 }, "scope": { "countTotal": 4, "countSubset": 2 } })
printjson(res.results);
Where "countTotal" (m) is the number of documents in the db, and "countSubset" (n) is the number of documents to retrieve.
This approach might give some problems on sharded databases.
You can pick random _id and return corresponding object:
db.collection.count(function(err, count) {
  db.collection.distinct("_id", function(err, result) {
    if (err)
      res.send(err)
    var randomId = result[Math.floor(Math.random() * (count - 1))]
    db.collection.findOne({ _id: randomId }, function(err, result) {
      if (err)
        res.send(err)
      console.log(result)
    })
  })
})
Here you don't need to spend space on storing random numbers in the collection.
The following aggregation operation randomly selects 3 documents from the collection:
db.users.aggregate(
[ { $sample: { size: 3 } } ]
)
https://docs.mongodb.com/manual/reference/operator/aggregation/sample/
MongoDB now has $rand
To pick n non-repeating items, aggregate with { $addFields: { _f: { $rand: {} } } }, then $sort by _f and $limit n.
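As a sketch, that pipeline would look something like this (n and the collection name are placeholders; $rand and the $unset stage require a reasonably recent MongoDB version):
// Tag every document with a random value, sort by it, and take n distinct docs.
db.collection.aggregate([
  { $addFields: { _f: { $rand: {} } } },
  { $sort: { _f: 1 } },
  { $limit: n },
  { $unset: "_f" } // drop the helper field from the output
])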
I'd suggest adding a random int field to each object. Then you can just do a
findOne({random_field: {$gte: rand()}})
to pick a random document. Just make sure you ensureIndex({random_field:1})
When I was faced with a similar solution, I backtracked and found that the business request was actually for creating some form of rotation of the inventory being presented. In that case, there are much better options, which have answers from search engines like Solr, not data stores like MongoDB.
In short, with the requirement to "intelligently rotate" content, what we should do instead of a random number across all of the documents is to include a personal q score modifier. To implement this yourself, assuming a small population of users, you can store a document per user that has the productId, impression count, click-through count, last seen date, and whatever other factors the business finds as being meaningful to compute a q score modifier. When retrieving the set to display, typically you request more documents from the data store than requested by the end user, then apply the q score modifier, take the number of records requested by the end user, then randomize the page of results, a tiny set, so simply sort the documents in the application layer (in memory).
If the universe of users is too large, you can categorize users into behavior groups and index by behavior group rather than user.
If the universe of products is small enough, you can create an index per user.
I have found this technique to be much more efficient, but more importantly more effective in creating a relevant, worthwhile experience of using the software solution.
None of the solutions worked well for me, especially when there are many gaps and the set is small.
This worked very well for me (in PHP):
$count = $collection->count($search);
$skip = mt_rand(0, $count - 1);
$result = $collection->find($search)->skip($skip)->limit(1)->getNext();
My PHP/MongoDB sort/order by RANDOM solution. Hope this helps anyone.
Note: I have numeric ID's within my MongoDB collection that refer to a MySQL database record.
First I create an array with 10 randomly generated numbers
$randomNumbers = [];
for ($i = 0; $i < 10; $i++) {
    $randomNumbers[] = rand(0, 1000);
}
In my aggregation I use the $addFields pipeline stage combined with $arrayElemAt and $mod (modulus). The modulus operator will give me a number from 0 - 9, which I then use to pick a number from the array of randomly generated numbers.
$aggregate[] = [
    '$addFields' => [
        'random_sort' => [ '$arrayElemAt' => [ $randomNumbers, [ '$mod' => [ '$my_numeric_mysql_id', 10 ] ] ] ],
    ],
];
After that you can use the sort Pipeline.
$aggregate[] = [
    '$sort' => [
        'random_sort' => 1
    ]
];
My simplest solution to this ...
db.coll.find()
  .limit(1)
  .skip(Math.floor(Math.random() * 500))
  .next()
Where you have at least 500 items in the collection.
If you have a simple id key, you could store all the id's in an array, and then pick a random id. (Ruby answer):
ids = @coll.find({}, fields: {_id: 1}).to_a
@coll.find(ids.sample).first
Using Map/Reduce, you can certainly get a random record, just not necessarily very efficiently depending on the size of the resulting filtered collection you end up working with.
I've tested this method with 50,000 documents (the filter reduces it to about 30,000), and it executes in approximately 400ms on an Intel i3 with 16GB ram and a SATA3 HDD...
db.toc_content.mapReduce(
  /* map function */
  function() { emit( 1, this._id ); },

  /* reduce function */
  function(k,v) {
    var r = Math.floor((Math.random()*v.length));
    return v[r];
  },

  /* options */
  {
    out: { inline: 1 },
    /* Filter the collection to "A"ctive documents */
    query: { status: "A" }
  }
);
The Map function simply creates an array of the id's of all documents that match the query. In my case I tested this with approximately 30,000 out of the 50,000 possible documents.
The Reduce function simply picks a random integer between 0 and the number of items (-1) in the array, and then returns that _id from the array.
400 ms sounds like a long time, and it really is; if you had fifty million records instead of fifty thousand, the overhead may increase to the point where it becomes unusable in multi-user situations.
There is an open issue for MongoDB to include this feature in the core... https://jira.mongodb.org/browse/SERVER-533
If this "random" selection was built into an index-lookup instead of collecting ids into an array and then selecting one, this would help incredibly. (go vote it up!)
This works nice, it's fast, works with multiple documents and doesn't require populating rand field, which will eventually populate itself:
add index to .rand field on your collection
use find and refresh, something like:
// Install packages:
//   npm install mongodb async
// Add index in mongo:
//   db.ensureIndex('mycollection', { rand: 1 })

var mongodb = require('mongodb')
var async = require('async')

// Find n random documents by using "rand" field.
function findAndRefreshRand (collection, n, fields, done) {
  var result = []
  var rand = Math.random()

  // Append documents to the result based on criteria and options, if options.limit is 0 skip the call.
  var appender = function (criteria, options, done) {
    return function (done) {
      if (options.limit > 0) {
        collection.find(criteria, fields, options).toArray(
          function (err, docs) {
            if (!err && Array.isArray(docs)) {
              Array.prototype.push.apply(result, docs)
            }
            done(err)
          }
        )
      } else {
        async.nextTick(done)
      }
    }
  }

  async.series([

    // Fetch docs with uninitialized .rand.
    // NOTE: You can comment out this step if all docs have initialized .rand = Math.random()
    appender({ rand: { $exists: false } }, { limit: n - result.length }),

    // Fetch on one side of random number.
    appender({ rand: { $gte: rand } }, { sort: { rand: 1 }, limit: n - result.length }),

    // Continue fetch on the other side.
    appender({ rand: { $lt: rand } }, { sort: { rand: -1 }, limit: n - result.length }),

    // Refresh fetched docs, if any.
    function (done) {
      if (result.length > 0) {
        var batch = collection.initializeUnorderedBulkOp({ w: 0 })
        for (var i = 0; i < result.length; ++i) {
          batch.find({ _id: result[i]._id }).updateOne({ rand: Math.random() })
        }
        batch.execute(done)
      } else {
        async.nextTick(done)
      }
    }

  ], function (err) {
    done(err, result)
  })
}

// Example usage
mongodb.MongoClient.connect('mongodb://localhost:27017/core-development', function (err, db) {
  if (!err) {
    findAndRefreshRand(db.collection('profiles'), 1024, { _id: true, rand: true }, function (err, result) {
      if (!err) {
        console.log(result)
      } else {
        console.error(err)
      }
      db.close()
    })
  } else {
    console.error(err)
  }
})
P.S. The "How to find random records in mongodb" question is marked as a duplicate of this question. The difference is that this question asks explicitly about a single record, while the other one asks explicitly about getting random documents.
For me, I wanted to get the same records in a random order, so I created an empty sort array, then generated random numbers between one and 7 (I have seven fields). Each time I get a different value, I assign a different random sort.
It is 'layman' but it worked for me.
//generate random number
const randomval = some random value;

//declare sort array and initialize to empty
const sort = [];

//write a conditional if/else to decide which sort to use
if (randomval == 1) {
  sort.push(...['createdAt', 1]);
}
else if (randomval == 2) {
  sort.push(...['_id', 1]);
}
....
else if (randomval == n) {
  sort.push(...['n', 1]);
}
If you're using mongoid, the document-to-object wrapper, you can do the following in
Ruby. (Assuming your model is User)
User.all.to_a[rand(User.count)]
In my .irbrc, I have
def rando klass
  klass.all.to_a[rand(klass.count)]
end
so in rails console, I can do, for example,
rando User
rando Article
to get documents randomly from any collection.
You can also use shuffle-array after executing your query:
var shuffle = require('shuffle-array');
Accounts.find(qry, function(err, results_array) {
  newIndexArr = shuffle(results_array);
});
What works efficiently and reliably is this:
Add a field called "random" to each document and assign a random value to it, add an index for the random field and proceed as follows:
Let's assume we have a collection of web links called "links" and we want a random link from it:
link = db.links.find().sort({random: 1}).limit(1)[0]
To ensure the same link won't pop up a second time, update its random field with a new random number:
db.links.update({ _id: link._id }, { $set: { random: Math.random() } })

MongoDB update/insert document and Increment the matched array element

I use Node.js and MongoDB with monk.js, and I want to do the logging in a minimal way, with one document per hour, like:
final doc:
{ time: YYYY-MM-DD-HH, log: [ {action: action1, count: 1 }, {action: action2, count: 27 }, {action: action3, count: 5 } ] }
The complete document should be created by incrementing one value.
E.g. someone visits a webpage first this hour, and the incrementing of action1 should create the following document with a query:
{ time: YYYY-MM-DD-HH, log: [ {action: action1, count: 1} ] }
Another user in this hour visits another webpage and the document should be extended to:
{ time: YYYY-MM-DD-HH, log: [ {action: action1, count: 1}, {action: action2, count: 1} ] }
and the values in count should be incremented on visiting the different webpages.
At the moment I create a doc for each action:
tracking.update({
  time: moment().format('YYYY-MM-DD_HH'),
  action: action,
  info: info
}, { $inc: { count: 1 } }, { upsert: true }, function (err) {});
Is this possible with monk.js / mongodb?
EDIT:
Thank you. Your solution looks clean and elegant, but it looks like my server can't handle it, or I am too much of a noob to make it work.
I wrote an extremely dirty solution with the action name as the key:
tracking.update({ time: time, ts: ts }, JSON.parse('{ "$inc": {"' + action + '": 1}}'), { upsert: true }, function (err) {});
Yes, it is very possible, and a well considered question. The only variation I would make on the approach is to rather calculate the "time" value as a real Date object ( quite useful in MongoDB, and easy to manipulate as well ), simply "rounding" the value with basic date math. You could use "moment.js" for the same result, but I find the math simple.
The other main consideration here is that mixing array "push" actions with possible "upsert" document actions can be a real problem, so it is best to handle this with "multiple" update statements, where only the condition you want is going to change anything.
The best way to do that, is with MongoDB Bulk Operations.
Consider that your data comes in something like this:
{ "timestamp": 1439381722531, "action": "action1" }
Where the "timestamp" is an epoch timestamp value acurate to the millisecond. So the handling of this looks like:
// Just adding for the listing, assuming already defined otherwise
var payload = { "timestamp": 1439381722531, "action": "action1" };

// Round to hour
var hour = new Date(
  payload.timestamp - ( payload.timestamp % ( 1000 * 60 * 60 ) )
);

// Init transaction
var bulk = db.collection.initializeOrderedBulkOp();

// Try to increment where array element exists in document
bulk.find({
  "time": hour,
  "log.action": payload.action
}).updateOne({
  "$inc": { "log.$.count": 1 }
});

// Try to upsert where document does not exist
bulk.find({ "time": hour }).upsert().updateOne({
  "$setOnInsert": {
    "log": [{ "action": payload.action, "count": 1 }]
  }
});

// Try to "push" where array element does not exist in matched document
bulk.find({
  "time": hour,
  "log.action": { "$ne": payload.action }
}).updateOne({
  "$push": { "log": { "action": payload.action, "count": 1 } }
});

bulk.execute();
So if you look through the logic there, then you will see that it is only ever possible for "one" of those statements to be true for any given state of the document, either existing or not. Technically speaking, the statement with the "upsert" can actually match a document when it exists; however, the $setOnInsert operation used makes sure that no changes are made unless the action actually "inserts" a new document.
Since all operations are fired in "Bulk", then the only time the server is contacted is on the .execute() call. So there is only "one" request to the server and only "one" response, despite the multiple operations. It is actually "one" request.
In this way the conditions are all met:
Create a new document for the current period where one does not exist and insert initial data to the array.
Add a new item to the array where the current "action" classification does not exist and add an initial count.
Increment the count property of the specified action within the array upon execution of the statement.
All in all, yes, possible, and also a great idea for storage as long as the action classifications do not grow too large within a period ( 500 array elements should be used as a maximum guide ), and the updating is very efficient and self-contained within a single document for each time sample.
The structure is also nice and well suited to other queries and possible additional aggregation purposes.
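For example, totalling the per-action counts across all hourly documents later is a straightforward aggregation (a sketch using the field names from the structure above):
// Total count per action across all hourly log documents.
db.collection.aggregate([
  { "$unwind": "$log" },
  { "$group": { "_id": "$log.action", "total": { "$sum": "$log.count" } } }
])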

Node, MongoDB (mongoose) distinct count

I have a collection with multiple documents, and every one of them has an 'eID' field that is not unique. I want to get the count for all the distinct 'eID' values.
Example: if there are 5 documents with the 'eID' = ObjectID(123) and 2 documents with 'eID' = ObjectID(321) I want to output something like:
{
ObjectID(123): 5,
ObjectID(321): 2
}
I don't know if that can be done in the same query, but after knowing which are the most occurring eIDs, I want to fetch the referenced documents using the ObjectID.
Mongoose version 3.8.8
$status is the specific field of the collection for which I need to count the distinct number of elements.
var agg = [
  { $group: {
    _id: "$status",
    total: { $sum: 1 }
  }}
];

model.Site.aggregate(agg, function(err, logs) {
  if (err) { return res.json(err); }
  return res.json(logs);
});
//output
[
  {
    "_id": "plan",
    "total": 3
  },
  {
    "_id": "complete",
    "total": 4
  },
  {
    "_id": "hault",
    "total": 2
  },
  {
    "_id": "incomplete",
    "total": 4
  }
]
This answer is not in terms of how this query can be written via mongoose, but I am familiar with the nodejs MongoClient class if you have further questions regarding implementation.
The best (most optimal) way I can think of doing this is to use mapReduce or aggregation on your database. The closest thing to a single command would be the distinct command, which can be invoked on collections, but this will only give you an array of distinct values for the eID key.
See here: http://docs.mongodb.org/manual/core/map-reduce/
For your specific problem, you will want your map and reduce functions roughly as follows:
var map = function() {
  var value = 1;
  emit(this.eID, value);
};

var reduce = function(key, values) {
  var result = 0;
  for (var i = -1; ++i < values.length;) {
    var value = values[i];
    result += value;
  }
  return result;
};
There might be an easier way to do this using the aggregation pipeline (I would post the link but I don't have enough reputation).
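For reference, a sketch of that aggregation-pipeline approach for the eID field from the question might look like this (Model stands for whichever Mongoose model holds the eID field):
// Count documents per distinct eID, most frequent first.
Model.aggregate([
  { $group: { _id: "$eID", count: { $sum: 1 } } },
  { $sort: { count: -1 } }
], function (err, result) {
  // result is an array like [ { _id: ObjectId(...), count: 5 }, ... ]
});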
I also found the mapReduce command for mongoose: http://mongoosejs.com/docs/api.html#model_Model.mapReduce
