I have a Node.js application that saves data to MongoDB.
Given one document, I want to find the most similar document in the database.
My idea is to implement some sort of nearest neighbour algorithm that takes all the records as a training sequence and returns the most similar document (including some sort of percentage on how similar these two documents are.)
E.g. having these records in my database...
{ name: "Bill", age: 10, pc: "Mac", ip: "68.23.13.8" }
{ name: "Alice", age: 22, pc: "Windows", ip: "193.186.11.3" }
{ name: "Bob", age: 12, pc: "Windows", ip: "56.89.22.1" }
...I want to find the closest document to this one
{ name: "Tom", age: 10, pc: "Mac", ip: "68.23.13.10" }
// algorithm returns "Bill", .76
Are there any Node modules/implementations that take any kind of objects/parameters and return their nearest neighbour?
Here is some example code. It assumes that you can run the search on every request. If you want to modify it, make sure that all similarity functions return a number between 0 and 1.
/**
 * Breaks a string into its overlapping two-character fragments (bigrams)
 * and returns them sorted with the default string ordering.
 *
 * @param {string} string - Text to split.
 * @returns {string[]} Sorted bigrams; empty when the text has fewer than two characters.
 */
function tokenize(string) {
  var bigrams = [];
  var position = 0;
  while (position < string.length - 1) {
    bigrams.push(string.slice(position, position + 2));
    position += 1;
  }
  bigrams.sort();
  return bigrams;
}
/**
 * Collects the elements two arrays have in common using a single merge-style
 * pass. Both arrays are expected to be sorted ascending (as tokenize returns
 * them); each matched pair contributes one element to the result.
 *
 * @param {Array} a - First sorted array.
 * @param {Array} b - Second sorted array.
 * @returns {Array} Elements present in both arrays, in ascending order.
 */
function intersect(a, b) {
  var common = [];
  var i = 0;
  var j = 0;
  while (i < a.length && j < b.length) {
    var left = a[i];
    var right = b[j];
    if (left < right) {
      // Left element is behind: advance the left cursor only.
      i += 1;
      continue;
    }
    if (left > right) {
      // Right element is behind: advance the right cursor only.
      j += 1;
      continue;
    }
    // Neither is smaller: record the match and move both cursors.
    common.push(left);
    i += 1;
    j += 1;
  }
  return common;
}
/**
 * Adds up the elements of an array, left to right, starting from 0.
 *
 * @param {number[]} items - Values to add.
 * @returns {number} The total; 0 for an empty array.
 */
function sum(items) {
  let total = 0;
  let index = 0;
  while (index < items.length) {
    total += items[index];
    index += 1;
  }
  return total;
}
/**
 * Scores how alike two strings are as 2 * |shared bigrams| / (|bigrams of a| +
 * |bigrams of b|), i.e. the Sørensen–Dice coefficient over character bigrams.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Similarity between 0 (nothing shared) and 1 (same bigrams).
 */
function wordSimilarity(a, b) {
  var left = tokenize(a);
  var right = tokenize(b);
  var totalBigrams = left.length + right.length;
  if (totalBigrams === 0) {
    // Both strings are shorter than two characters, so there is nothing to
    // compare bigram-wise; dividing would give 0/0 = NaN and poison any
    // average built on top. Fall back to plain equality instead.
    return a === b ? 1 : 0;
  }
  var middle = intersect(left, right);
  return (2 * middle.length) / totalBigrams;
}
/**
 * Scores how close two dotted IPv4 address strings are by comparing them
 * octet by octet: the absolute differences of the four octets are summed and
 * scaled by the largest possible total (4 * 255).
 *
 * @param {string} a - First address, e.g. "68.23.13.8".
 * @param {string} b - Second address.
 * @returns {number} 1 for identical octets, decreasing towards 0 as they diverge.
 */
function ipSimilarity(a, b) {
  var OCTET_COUNT = 4;
  var OCTET_MAX = 255;
  var leftOctets = a.split('.');
  var rightOctets = b.split('.');
  var gaps = [];
  for (var k = 0; k < OCTET_COUNT; k++) {
    // Subtraction coerces the octet strings to numbers.
    gaps.push(Math.abs(leftOctets[k] - rightOctets[k]));
  }
  return 1 - sum(gaps) / (OCTET_MAX * OCTET_COUNT);
}
/**
 * Scores how close two ages are: 1 when equal, falling linearly to 0 as the
 * difference approaches maxAge.
 *
 * The result is clamped at 0 so that a difference larger than maxAge cannot
 * produce a negative score (similarity measures are expected to stay within
 * 0..1).
 *
 * @param {number} a - First age.
 * @param {number} b - Second age.
 * @param {number} [maxAge=100] - Age difference that counts as "completely different".
 * @returns {number} Similarity between 0 and 1.
 */
function ageSimilarity(a, b, maxAge = 100) {
  var distance = Math.abs(a - b) / maxAge;
  return Math.max(0, 1 - distance);
}
/**
 * Scores how alike two records are by averaging one similarity per field:
 * name and pc via wordSimilarity, age via ageSimilarity, ip via ipSimilarity.
 * Every field carries the same weight.
 *
 * @param {Object} a - First record (reads name, age, pc, ip).
 * @param {Object} b - Second record (same fields).
 * @returns {number} Mean of the four per-field similarities.
 */
function recordSimilarity(a, b) {
  // [field name, measure] pairs, evaluated in this order.
  var comparisons = [
    ['name', wordSimilarity],
    ['age', ageSimilarity],
    ['pc', wordSimilarity],
    ['ip', ipSimilarity]
  ];
  var total = 0;
  comparisons.forEach(function (comparison) {
    var key = comparison[0];
    var measure = comparison[1];
    total += measure(a[key], b[key]);
  });
  return total / comparisons.length;
}
/**
 * Scans all records and returns the one scoring highest against the query
 * according to recordSimilarity. On ties the earliest record wins.
 *
 * @param {Object[]} items - Candidate records.
 * @param {Object} query - Record to compare every candidate with.
 * @returns {?Object} The best match, or null when items is empty or no
 *   candidate scores above 0.
 */
function findMostSimilar(items, query) {
  var best = null;
  var bestScore = 0;
  for (var candidate of items) {
    var score = recordSimilarity(candidate, query);
    if (score > bestScore) {
      bestScore = score;
      best = candidate;
    }
  }
  return best;
}
// Sample records standing in for the documents stored in the database.
var items = [
  {
    name: "Bill",
    age: 10,
    pc: "Mac",
    ip: "68.23.13.8"
  },
  {
    name: "Alice",
    age: 22,
    pc: "Windows",
    ip: "193.186.11.3"
  },
  {
    name: "Bob",
    age: 12,
    pc: "Windows",
    ip: "56.89.22.1"
  }
];

// The document whose nearest neighbour is wanted.
var query = {
  name: "Tom",
  age: 10,
  pc: "Mac",
  ip: "68.23.13.10"
};

// Pick the closest record and print it.
var result = findMostSimilar(items, query);
console.log(result);
A straightforward way of doing this would be to calculate a diff between the two documents and the larger the diff, the larger the distance. You could normalize the diff using the maximum possible diff which should give you relative distances that you can compare against each other.
Take a look at this question for calculating a diff on json documents.
Delta encoding for JSON objects
Related
I'm trying to make a comparison function. If the value I search for does not exist, the new value should be saved to the database. But all I get is "0 new data found", so the system decides not to save the data. Is my search wrong?
This is my code:
// Counts the entries that were treated as new, i.e. for which the lookup below came back as null.
var count = 0;
// Walks indices 1..150 of the parallel arrays (name, company, country, ...), which are defined outside this snippet.
for (var t = 1; t < 151; t++) {
// NOTE(review): the return value of find() is compared to null directly, without awaiting it
// or passing a callback. If find() returns a query/promise object rather than the matching
// documents (as the answer below implies), the condition can never be true - confirm against
// the ODM's API.
var searching = JobsSchema.find({ jobName: name[t], company: company[t] })
if (searching == null) {
count = count + 1;
var newJobs = new JobsSchema({
"jobName": name[t],
"country": country[t],
"company": company[t],
"jobType": type[t],
"salary": salary[t],
"skills": skills[t],
"jobDesc": desc[t],
"jobReq": req[t],
"jobResponsibility": resp[t],
"industry": industry[t],
})
// The save is started with a callback and not waited for; a failure is only logged.
newJobs.save(function (err, result) {
if (err) {
console.log(err);
}
})
}
}
// Logged right after the loop finishes.
console.log(count + " new data found.");
You should await your find function.
Also, change it to findOne to return a single instance of the JobsSchema and await the save call as well.
Finally you will need to wrap the code into an async function:
/**
 * Saves every job that is not already stored.
 *
 * For indices 1..150 of the parallel arrays (`name`, `company`, `country`, ...
 * - defined outside this snippet) it looks up a document with the same jobName
 * and company, and creates a new one only when the lookup finds nothing.
 * Lookups and saves are awaited one after another.
 *
 * @returns {Promise<void>} Resolves after all entries were processed and the
 *   count was logged; rejects with the first lookup or save error.
 */
const saveData = async () => {
  let count = 0;
  for (let t = 1; t < 151; t++) {
    const existing = await JobsSchema.findOne({
      jobName: name[t],
      company: company[t],
    });
    if (existing) {
      // Already stored: nothing to do for this entry.
      continue;
    }
    count = count + 1;
    const newJob = new JobsSchema({
      jobName: name[t],
      country: country[t],
      company: company[t],
      jobType: type[t],
      salary: salary[t],
      skills: skills[t],
      jobDesc: desc[t],
      jobReq: req[t],
      jobResponsibility: resp[t],
      industry: industry[t],
    });
    await newJob.save();
  }
  console.log(count + ' new data found.');
};

// Handle a rejection explicitly instead of leaving the returned promise floating.
saveData().catch((err) => {
  console.error(err);
});
I have a JSON file containing game objects, and I want to get the top 5 games with the highest total playtime across users.
I tried to get all objects by reading the file using file system in nodejs :
/**
 * Reads games.json (two directories above this module) synchronously and
 * returns its parsed content.
 *
 * @returns {Promise<*>} Resolves with whatever JSON value the file contains.
 */
const queryGames = async () => {
  const gamesFile = path.resolve(__dirname, '../../games.json');
  return JSON.parse(fs.readFileSync(gamesFile));
};
/**
* Query for the top games by play time.
* Currently an empty stub: the async body has no statements, so the returned
* promise resolves to undefined.
* @returns {Promise<QueryResult>} intended result - NOTE(review): QueryResult is not defined in this snippet
*/
const selectTopByPlaytime = async () => {
}
this is the json file : https://jsoneditoronline.org/#left=cloud.3b82169327044c04b7207fa186aee85b&right=local.tiniqu
something like this should work.
const gamePlayData = require('./gamePlay.json').data
/**
 * Totals the play time recorded for each game.
 *
 * Each record is read through its `game` and `playTime` properties.
 *
 * @param {Object[]} [playData=gamePlayData] - Play records to aggregate;
 *   defaults to the module-level data set loaded from gamePlay.json.
 * @returns {Object} Map of game name to summed play time, e.g.
 *   {
 *     'League of legends': 1650,
 *     'World of warcraft': 2300,
 *     'Dark Souls': 218,
 *     'The Witcher 3: Wild Hunt': 987,
 *     etc....
 *   }
 */
const getGamePlayTimes = (playData = gamePlayData) => {
  // Declared locally: assigning to an undeclared name would create a global
  // (and throws in strict mode / ES modules).
  const gamePlayTimes = {};
  playData.forEach(({ game, playTime }) => {
    // Own-property check so names such as "constructor" or "toString" are not
    // mistaken for an existing total inherited from Object.prototype.
    if (Object.prototype.hasOwnProperty.call(gamePlayTimes, game)) {
      gamePlayTimes[game] += playTime;
    } else {
      gamePlayTimes[game] = playTime;
    }
  });
  return gamePlayTimes;
};
/**
 * Turns a { gameName: playTime } map into a list of { game, playTime } objects,
 * one per enumerable key, in the map's iteration order.
 *
 * @param {Object} playTimes - Map of game name to total play time.
 * @returns {Array<{game: string, playTime: *}>} One entry per game.
 */
const getGamesAndTimesAsList = (playTimes) => {
  const entries = [];
  for (const title in playTimes) {
    entries.push({ game: title, playTime: playTimes[title] });
  }
  return entries;
};
/**
 * Sorts an array of objects in place, largest `par` value first.
 *
 * Uses the built-in sort (stable, so elements with equal values keep their
 * relative order, as the former bubble sort did) instead of a hand-written
 * quadratic loop. Values are compared with `<`, so numbers and strings both work.
 *
 * @param {Object[]} a - Array to sort; it is modified and also returned.
 * @param {string} par - Name of the property to sort by.
 * @returns {Object[]} The same array instance, now in descending order.
 */
const reverseBubbleSort = (a, par) => {
  a.sort((first, second) => {
    if (first[par] < second[par]) {
      return 1;
    }
    if (second[par] < first[par]) {
      return -1;
    }
    return 0;
  });
  return a;
};
// Aggregate play time per game, convert the totals to a list and order it by
// play time, highest first. (The helper is named getGamePlayTimes; the
// previously called getGameAndPlayTimes does not exist.)
const sortedArr = reverseBubbleSort(getGamesAndTimesAsList(getGamePlayTimes()), 'playTime');
// Keep the five games with the most play time.
const top5 = sortedArr.slice(0, 5);
console.log(top5);
The code below only outputs the last result, and I don't get why. I check whether each item's updateDate contains 2020-05; both items do, yet I get only the last one. The loop is not looping :)
// Sample briefing entries, each carrying an ISO-style updateDate string.
const briefing = [
{
"updateDate": "2020-05-05T00:00:00.0Z",
},
{
"updateDate": "2020-05-06T00:00:00.0Z",
},
{
"updateDate": "2020-05-13T00:00:00.0Z",
}
];
let date = new Date();
// Builds "YYYY-MM-" for the current date: the month is zero-padded to two digits and a trailing '-' is appended.
var formattedYearMonth = date.getFullYear() + '-' + ('0' + (date.getMonth()+1)).slice(-2) + '-';
// Scans every entry; `response` is reassigned on each match, so after the loop it holds
// only the last matching entry (or undefined when nothing matched).
for (var i = 0; i < briefing.length; i++) {
var jsonDate = briefing[i].updateDate;
if (jsonDate.includes(formattedYearMonth)) {
var response = JSON.stringify(briefing[i]);
}
// NOTE(review): the `return` and the closing brace below belong to an enclosing function
// whose header is not part of this snippet.
}return response;
}
// Quoted from the question: the loop visits every entry, but keeps only one result.
for (var i = 0; i < briefing.length; i++) {
var jsonDate = briefing[i].updateDate;
if (jsonDate.includes(formattedYearMonth)) {
var response = JSON.stringify(briefing[i]); // <==== THIS IS WHERE YOUR PROBLEM LIES: response is overwritten on every match, so only the last one survives
}
}return response;
The loop is actually looping :). But for every run of the loop, you are resetting the value of response.
--EDITED--
For the response to be an array, you need to modify your code as
// Collects every matching entry instead of overwriting a single variable.
let response = [];
for (var i = 0; i < briefing.length; i++) {
var jsonDate = briefing[i].updateDate;
// Keep the entry when its date string contains the "YYYY-MM-" prefix built earlier.
if (jsonDate.includes(formattedYearMonth)) {
// Each match is stored as its own JSON string, so the result is an array of strings.
response.push(JSON.stringify(briefing[i]));
}
}
// NOTE(review): this `return` belongs to an enclosing function that is not part of this snippet.
return response;
Our company is planning on transitioning from REDIS to Aerospike, but we are seeing some strange issues with missing get requests (only 35% making it back to the callback function).
Here is the code we are testing with:
// Load-test script: the master process forks one worker per CPU, and every worker
// hammers a local Aerospike node with gets and puts while counting requests and responses.
var cluster = require('cluster');
var numCPUs = require('os').cpus().length;
if (cluster.isMaster)
{
// Master: only spawns the workers; it issues no requests itself.
for (var i = 0; i < numCPUs; i++)
{
var worker = cluster.fork();
}
}
else
{
// Worker: start time and counters are per process and are never reset,
// so the numbers printed below are cumulative since the worker started.
var start = new Date().getTime();
var requests = 0;
var responses = 0;
var aerospike = require('./node_modules/aerospike');
var status = aerospike.status;
var client = aerospike.client({
hosts: [
{ addr: '127.0.0.1', port: 3000 }
]
});
// Connection callback: logs on success; any other status is ignored without a message.
function connect_cb( err, client) {
if (err.code == status.AEROSPIKE_OK) {
console.log("Aerospike Connection Success")
}
}
client.connect(connect_cb)
// Every 10 ms: issue 50 gets (keys floor_0..floor_49). `requests` is counted when the
// call is issued, `responses` only when the callback fires.
setInterval(function(){
for(var i=0; i<50; i++)
{
var key = aerospike.key('dexi','toys','floor_'+i);
requests++;
client.get(key, function(err, rec, meta) {
responses++;
if ( err.code == status.AEROSPIKE_OK )
{
}
else
{
console.error('Get Error:', err);
}
});
}
},10);
// Every 10 ms: write the same 50 keys with a fixed record.
setInterval(function(){
for(var i=0; i<50; i++)
{
var key = aerospike.key('dexi','toys','floor_'+i);
var rec = {
uid: 1000, // integer data stored in bin called "uid"
name: "user_name", // string data stored in bin called "name"
dob: { mm: 12, dd: 29, yy: 1995}, // map data stored (msgpack format) in bin called "dob"
friends: [1001, 1002, 1003]
};
var metadata = {
ttl: 10000,
gen: 0
};
client.put(key, rec, metadata, function(err) {
switch ( err.code ) {
case status.AEROSPIKE_OK:
break;
default:
console.error("Put Error: " + err.message);
// NOTE(review): exitCode is not declared anywhere in this script, so this assignment
// creates an implicit global; nothing visible here reads it.
exitCode = 1;
break;
}
});
}
},10);
// Every 15 s: print gets issued, get callbacks received, and milliseconds since start.
setInterval(function(){
var timeSpent = ( new Date().getTime()) - start;
console.log(requests, responses,timeSpent);
},15000);
}
Below is the console output we are seeing:
34400 9306 15098
34150 9250 15080
35050 9330 15087
34150 9235 15092
33250 9310 15120
33950 9249 15090
34650 9298 15101
35000 9400 15102
34700 9300 15166
33150 9399 15181
34500 9300 15193
33850 9292 15207
34400 9250 15162
34100 9360 15212
34050 9250 15171
34100 9348 15159
33800 9250 15118
34300 9309 15189
34050 9300 15152
34250 9405 15181
As you can see, on average, for every 35k get requests we send, we are only seeing a small % of them actually come back. Our Aerospike dashboard also reflects the discrepancy (only seeing 35% of the gets sent), as the throughput is reflecting the responses we are getting back.
I have this document:
{
"_id": ObjectId("xxx"),
"props": {
"a": "a1",
"b": "b2"
}
}
My query looks like this:
// Matches documents whose props.a is "a1" and whose props.b is "b2"
// (dot notation addresses fields inside the embedded props document).
db.collection.find({"$and": [ {"props.a" : "a1"}, {"props.b": "b2"} ]})
I get the elements of the query from GET values:
/api/search?a=a1&b=b1
So, I need a way to generate my query dynamically, starting from the GET object...
I thought about something like this:
// Little helper to get the object key name by index.
// Defined as a non-enumerable property: a plain assignment to Object.prototype
// would make "key" show up in every for...in loop over any object (including
// the helper's own loop, which then returned "key" for an out-of-range index).
// NOTE(review): extending Object.prototype still affects every object in the
// process; a standalone function would be safer.
Object.defineProperty(Object.prototype, 'key', {
  value: function key(int) {
    // Own enumerable keys in Object.keys order; undefined when int is out of range.
    return Object.keys(this)[int];
  },
  writable: true,
  configurable: true,
  enumerable: false
});
// My attempt: build one {"props.<name>": value} clause per GET parameter.
var query = [],
i = 0;
_.each(req.query, function(prop) {
// Looks up the name of the i-th parameter through the Object.prototype.key helper.
var key = req.query.key(i);
// NOTE: the next line is a SyntaxError - an object-literal key cannot be an expression
// such as "props." + key; a dynamic key has to be assigned with bracket notation.
query.push({"props." + key: prop});
i = i + 1;
});
But it does not work...
if I do:
// Builds one nested { props: { <name>: <value> } } clause per GET parameter and
// appends it to `props`. The field is named `props` so the clauses match the
// output shown below and the stored document's field. `props` and `i` are
// expected to be declared by the surrounding code.
_.each(req.query, function(prop) {
  var key = {};
  key.props = {};
  key.props[req.query.key(i)] = prop;
  props.push(key);
  i = i + 1;
});
I get this:
[{ props: { a: 'a1' } }, { props: { b: 'b1' } } ]
but in this way I could get only this query:
// Each clause here compares the whole props sub-document for equality,
// not a single field inside it.
db.collection.find({"$and": [ { props: { a: 'a1' } }, { props: { b: 'b1' } } ]})
which is completely different from the one I've written above (this one searches for a props that is exactly like the ones I've provided, whereas the original looks for one that contains one of the values)
how can I do?
OK, I've found a solution; searching on Google did not return any solutions...
// Build the MongoDB filter from the GET parameters: every ?name=value pair
// becomes a {"props.<name>": value} clause (dot notation, so each clause
// matches one field inside the embedded props document).
// NOTE(review): req.query comes straight from the URL. Depending on the query
// parser, a value may arrive as an object (e.g. ?a[$ne]=x) and would then be
// passed to MongoDB as an operator - validate or coerce values before use.
var props = [];
// The iteratee receives (value, key), so no index bookkeeping or
// Object.prototype helper is needed to learn the parameter name.
_.each(req.query, function (value, name) {
  var clause = {};
  clause["props." + name] = value;
  props.push(clause);
});
// $and must be given a non-empty array; with no parameters fall back to an
// empty filter, which matches every document.
var query = props.length > 0 ? {"$and": props} : {};
In this way the query generated is exactly like I need it.