save many records to couchdb in nodejs

I have a very large dataset that I want to save in couchdb for searchability.
I want the records to look like this:
{
  "type": "first",
  "name": "ryan",
  "count": 447980
}
Since the text-files are larger than I should hold in memory, I am setting up a streaming readline reader, like so:
var db = require('./db'),
    readline = require('readline'),
    path = require('path'),
    fs = require('fs');

// simple callback after cradle save
function saveHandler(er, doc){
  if (er) return console.log('Error: ', er);
  console.log(doc);
}

// save record of type, based on line with count & name
function handleCountedLine(type, line){
  return function(line){
    var record = {type: type};
    var i = line.trim().split(' ');
    record.name = i[1].trim();
    record.count = Number(i[0]);
    db.save(record, saveHandler);
  };
}

var handleFirst = handleCountedLine('first');

readline.createInterface({
  input: fs.createReadStream('data/facebook-firstnames-withcount.txt'),
  terminal: false
})
.on('line', handleFirst);
db is a cradle db.
After 40 records or so, it slows to a total crawl, then eventually runs out of memory. I tried poolr and node-rate-limiter, using "only run this many at a time" & "only allow this many to run in a minute" strategies. Both work a little better, but it still runs out of memory. Is there a good way to accomplish this goal, or am I stuck writing it in Python?

With awesome help from Paulo Machado in google hangouts, I made an answer using line-by-line, a simple wrapper that uses stream.pause() & stream.resume() to only allow a single line to be processed at a time. I'd like to give him the credit, but he hasn't come over here to make an answer, so I will just put this here. It has parsed 34039 records, so far. I will update the answer if it crashes.
var LineByLineReader = require('line-by-line'),
    path = require('path'),
    db = require('./db');

// line-by-line read file, turn into a couch record
function processFile(type){
  var fname = path.join('data', types[type] + '.txt');
  var lr = new LineByLineReader(fname, {skipEmptyLines: true});
  lr.on('error', function (err) {
    console.log('Error:');
    console.log(err);
  });
  lr.on('record', function (record) {
    console.log('Saved:');
    console.log(record);
  });
  lr.on('line', function (line) {
    lr.pause();
    var record = { type: type };
    if (type == 'full'){
      record.name = line.trim().split(' ');
    } else {
      var i = line.trim().split(' ');
      record.name = i[1].trim();
      record.count = Number(i[0]);
    }
    db.save(record, function(er, res){
      if (er) lr.emit('error', er, record);
      if (res) lr.emit('record', record);
      lr.resume();
    });
  });
}

var types = {
  'first': 'facebook-firstnames-withcount',
  'last': 'facebook-lastnames-withcount',
  'full': 'facebook-names-unique'
};

for (type in types){
  processFile(type);
}

// views for looking things up
db.save('_design/views', require('./views'));

I guess CouchDB is the bottleneck here. Have a look at CouchDB's bulk document API, which allows you to insert documents en masse. (You should probably not try to commit all your data at once, but accumulate a bunch of docs in an array and push that batch to the database -- use stream.pause() and stream.resume() to throttle the text stream.) CouchDB will reward you with efficiency gains if you use the bulk API.
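For example, here is a minimal sketch of that batching idea, reusing the line-by-line reader from the answer above. It assumes cradle's db.save() accepts an array of documents and forwards it to CouchDB's _bulk_docs endpoint; the batch size is arbitrary.

var LineByLineReader = require('line-by-line');
var db = require('./db'); // cradle db, as in the question

var BATCH_SIZE = 500; // arbitrary; tune to your memory budget
var batch = [];

function flush(done){
  if (!batch.length) return done();
  var docs = batch;
  batch = [];
  // assumption: passing an array to cradle's save() performs a bulk insert
  db.save(docs, function(er, res){
    if (er) console.log('Bulk save error:', er);
    done();
  });
}

var lr = new LineByLineReader('data/facebook-firstnames-withcount.txt', {skipEmptyLines: true});

lr.on('line', function(line){
  var i = line.trim().split(' ');
  batch.push({ type: 'first', name: i[1].trim(), count: Number(i[0]) });
  if (batch.length >= BATCH_SIZE){
    lr.pause(); // throttle the text stream while the batch is in flight
    flush(function(){ lr.resume(); });
  }
});

lr.on('end', function(){
  flush(function(){ console.log('done'); });
});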

Related

can't get variable value in node js

I tried to make the function async, but when I print attacks it prints out {} with nothing in it. However, when I print the values right after adding them to attacks, they are there. Why is that, and how can I use the value?
var fs = require('fs');
var http = require('http');

var attacks = {};
var phase_name;
var directory = 'cti-master\\enterprise-attack\\attack-pattern\\';

// getting all files names.
async function getData(directory){
  fs.readdir(directory, (err, files) => {
    if (err) { return; }
    var fileNum = 0;
    // opening all the files and sorting the data in them.
    while (fileNum < files.length - 1)
    {
      fs.readFile(directory + files[fileNum], 'utf8', (err, data) =>
      {
        // parsing the data from json.
        var fileData = JSON.parse(data);
        // sometimes there is no phase name.
        if (fileData['objects'][0]['kill_chain_phases'] == undefined){ phase_name = undefined; }
        else { phase_name = fileData['objects'][0]['kill_chain_phases'][0]['phase_name']; }
        // sorting data by name to make it easier later.
        attacks[fileData['objects'][0]['name']] = {
          id: fileData['objects'][0]['id'],
          type: fileData['objects'][0]['type'],
          description: fileData['objects'][0]['description'],
          x_mitre_platforms: fileData['objects'][0]['x_mitre_platforms'],
          x_mitre_detection: fileData['objects'][0]['x_mitre_detection'],
          phase_name: phase_name
        };
      });
      fileNum += 1;
    }
  });
  var keys = Object.keys(attacks);
  console.log(attacks);
}

getData(directory);
The reason for the empty log is that Node does not wait for the while loop (and its readFile callbacks) to finish, hence you are getting an empty log. Basically, you can improve this code by using the async/await method.
But if you want to stick with this code, I suggest the following logic.
Just move your log inside an if block whose condition is "print only if the expected file count has been reached".
for example.
if (fileNum === files.length) {
  var keys = Object.keys(attacks);
  console.log(attacks);
}
Now the log prints only when this condition is satisfied, which means after completion of the while loop.
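For completeness, here is a minimal sketch of the async/await approach mentioned above, using fs.promises. The field names are taken from the question's code; error handling is intentionally simple.

const fs = require('fs').promises;

async function getData(directory){
  const attacks = {};
  const files = await fs.readdir(directory);

  for (const file of files){
    const data = await fs.readFile(directory + file, 'utf8');
    const obj = JSON.parse(data)['objects'][0];
    const phases = obj['kill_chain_phases']; // sometimes there is no phase name

    attacks[obj['name']] = {
      id: obj['id'],
      type: obj['type'],
      description: obj['description'],
      x_mitre_platforms: obj['x_mitre_platforms'],
      x_mitre_detection: obj['x_mitre_detection'],
      phase_name: phases ? phases[0]['phase_name'] : undefined
    };
  }

  return attacks;
}

// attacks is only available once the returned promise resolves
getData(directory).then((attacks) => console.log(attacks));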

Correct way to organise this process in Node

I need some advice on how to structure this function as at the moment it is not happening in the correct order due to node being asynchronous.
This is the flow I want to achieve; I don't need help with the code itself, but with the order of operations to achieve the end result, and any suggestions on how to make it efficient:
1. Node routes a GET request to my controller.
2. Controller reads a .csv file on local system and opens a read stream using fs module
3. Then use csv-parse module to convert that to an array line by line (many 100,000's of lines)
4. Start a try/catch block
5. With the current row from the csv, take a value and try to find it in a MongoDB
6. If found, take the ID and store the line from the CSV and this id as a foreign ID in a separate database
7. If not found, create an entry into the DB and take the new ID and then do 6.
8. Print out to terminal the row number being worked on (ideally at some point I would like to be able to send this value to the page and have it update like a progress bar as the rows are completed)
Here is a small part of the code structure that I am currently using;
const fs = require('fs');
const parse = require('csv-parse');

function addDataOne(req, id) {
  const modelOneInstance = new InstanceOne({ ...code });
  const resultOne = modelOneInstance.save();
  return resultOne;
}

function addDataTwo(req, id) {
  const modelTwoInstance = new InstanceTwo({ ...code });
  const resultTwo = modelTwoInstance.save();
  return resultTwo;
}

exports.add_data = (req, res) => {
  const fileSys = 'public/data/';
  const parsedData = [];
  let i = 0;
  fs.createReadStream(`${fileSys}${req.query.file}`)
    .pipe(parse({}))
    .on('data', (dataRow) => {
      let RowObj = {
        one: dataRow[0],
        two: dataRow[1],
        three: dataRow[2],
        etc,
        etc
      };
      try {
        ModelOne.find(
          { propertyone: RowObj.one, propertytwo: RowObj.two },
          '_id, foreign_id'
        ).exec((err, searchProp) => {
          if (err) {
            console.log(err);
          } else {
            if (searchProp.length > 1) {
              console.log('too many returned from find function');
            }
            if (searchProp.length === 1) {
              addDataOne(RowObj, searchProp[0]).then((result) => {
                searchProp[0].foreign_id.push(result._id);
                searchProp[0].save();
              });
            }
            if (searchProp.length === 0) {
              let resultAddProp = null;
              addDataTwo(RowObj).then((result) => {
                resultAddProp = result;
                addDataOne(req, resultAddProp._id).then((result) => {
                  resultAddProp.foreign_id.push(result._id);
                  resultAddProp.save();
                });
              });
            }
          }
        });
      } catch (error) {
        console.log(error);
      }
      i++;
      let iString = i.toString();
      process.stdout.clearLine();
      process.stdout.cursorTo(0);
      process.stdout.write(iString);
    })
    .on('end', () => {
      res.send('added');
    });
};
I have tried to make the functions use async/await, but it seems to conflict with the fs.createReadStream / csv-parse functionality, probably due to my inexperience and incorrect use of the code...
I appreciate that this is a long question about the fundamentals of the code, but some tips/advice/pointers on how to get this going would be appreciated. I had it working when the data was sent one record at a time via a POST request from Postman, but I can't implement the next stage, which is to read from the CSV file containing many records.
First of all, you can combine the following checks into one query:
if (searchProp.length === 1) {
if (searchProp.length === 0) {
Use the upsert option in MongoDB's findOneAndUpdate query to update or insert in a single operation.
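For example, a minimal sketch with Mongoose, reusing the model and field names from the question (treat it as an illustration of the upsert pattern, not the exact query you need):

// Find a matching document or insert one if none exists, in a single round trip.
ModelOne.findOneAndUpdate(
  { propertyone: RowObj.one, propertytwo: RowObj.two },                                   // filter
  { $setOnInsert: { propertyone: RowObj.one, propertytwo: RowObj.two, foreign_id: [] } }, // only applied on insert
  { upsert: true, new: true }                                                             // return the (possibly new) doc
).exec((err, doc) => {
  if (err) return console.log(err);
  // doc is either the existing document or the freshly upserted one
  addDataOne(RowObj, doc._id).then((result) => {
    doc.foreign_id.push(result._id);
    doc.save();
  });
});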
Secondly, don't do this in the main thread. Use a queue mechanism; it will be much more efficient.
The queue I personally use is Bull:
https://github.com/OptimalBits/bull#basic-usage
This also provides the progress-reporting functionality you need.
Also, regarding using async/await with a read stream, plenty of examples can be found online, such as: https://humanwhocodes.com/snippets/2019/05/nodejs-read-stream-promise/
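As a sketch of that idea: in a reasonably recent Node version readable streams are async-iterable, so the CSV rows can be consumed with for await and each row awaited in order (handleRow below is a hypothetical helper that would wrap the find/upsert logic above):

const fs = require('fs');
const parse = require('csv-parse');

exports.add_data = async (req, res) => {
  const fileSys = 'public/data/';
  const parser = fs.createReadStream(`${fileSys}${req.query.file}`).pipe(parse({}));

  let i = 0;
  // for await pulls one row at a time and waits for the body of the loop,
  // so rows are processed strictly in order.
  for await (const dataRow of parser) {
    const RowObj = { one: dataRow[0], two: dataRow[1], three: dataRow[2] };
    await handleRow(RowObj); // hypothetical: the findOneAndUpdate logic from above
    i++;
    process.stdout.clearLine();
    process.stdout.cursorTo(0);
    process.stdout.write(String(i));
  }

  res.send('added');
};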

Using Redis SCAN in NODE

I have Redis with a lot of keys in some format and I want to get the keys that match some pattern and do some operations on them. I don't use the KEYS command since it's not recommended in production. Using SCAN, I'm wondering what the best way to write it in code is. I have to do something like a while loop, but using promises; my current solution looks like this (the code is simplified a little):
'use strict'

const Promise = require('bluebird');
const config = require('./config');
const client = require('./client');

let iterator = 0;

Promise.coroutine(function* () {
  do {
    iterator = yield client.scanAsync(iterator, 'myQuery', 'COUNT', config.scanChunkSize)
      .then(data => {
        let nextIterator = data[0];
        let values = data[1];
        // do some magic with values
        return nextIterator;
      });
  } while (iterator !== '0');
})();
Is there a better way to do it that I'm missing?
I realize this is a really old question, but I found all of the other answers very unsatisfying. Here is yet another attempt to scan in a relatively clean way using async await (WITHOUT the use of yet another external dependency). You can easily modify this to continuously delete each set of found keys (you would want to tackle them in batches like this in case there are LOTS). Pushing them into an array just demonstrates one very basic thing you could do with them during this stage.
const redis = require('redis');
const { promisify } = require('util');

const client = redis.createClient({ ...opts });
const scan = promisify(client.scan).bind(client);

const scanAll = async (pattern) => {
  const found = [];
  let cursor = '0';
  do {
    const reply = await scan(cursor, 'MATCH', pattern);
    cursor = reply[0];
    found.push(...reply[1]);
  } while (cursor !== '0');
  return found;
};
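A possible usage, where the pattern is just an example:

(async () => {
  const keys = await scanAll('session:*'); // hypothetical pattern
  console.log(`found ${keys.length} keys`);
})();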
You can use recursion to keep calling scan until done.
function scanAsync(cursor, pattern, returnSet){
  return redisClient.scanAsync(cursor, "MATCH", pattern, "COUNT", "100").then(
    function (reply) {
      cursor = reply[0];
      var keys = reply[1];
      keys.forEach(function(key, i){
        returnSet.add(key);
      });
      if (cursor === '0'){
        return Array.from(returnSet);
      } else {
        return scanAsync(cursor, pattern, returnSet);
      }
    });
}

Pass in a Set() to make sure keys aren't duplicated:

myResults = new Set();
scanAsync('0', "NOC-*[^listen]*", myResults).map(
  function(myResults){ console.log(myResults); }
);
You can try this snippet to scan 1000 keys per iteration and delete them.
var cursor = '0';

function scan(pattern, callback){
  redisClient.scan(cursor, 'MATCH', pattern, 'COUNT', '1000', function(err, reply){
    if (err){
      throw err;
    }
    cursor = reply[0];
    var keys = reply[1];
    // delete whatever this batch returned (the final batch can contain keys too)
    keys.forEach(function(key, i){
      redisClient.del(key, function(deleteErr, deleteSuccess){
        console.log(key);
      });
    });
    if (cursor === '0'){
      return callback();
    } else {
      return scan(pattern, callback);
    }
  });
}

scan(strkey, function(){
  console.log('Scan Complete');
});
A nice option for the node-redis module (v4 and later) is to use scan iterators. Example:
const redis = require("redis");
const client = redis.createClient();

async function getKeys(pattern = "*", count = 10) {
  const results = [];
  const iteratorParams = {
    MATCH: pattern,
    COUNT: count
  };
  for await (const key of client.scanIterator(iteratorParams)) {
    results.push(key);
  }
  return results;
}
(Of course, you can also process your keys on the fly in the for await loop, without storing them in an additional array, if that's enough for you.)
If you do not want to override the scan parameters (MATCH/COUNT), you can just skip them and call client.scanIterator() without a parameter (the defaults will be used then: MATCH="*", COUNT=10).
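For instance, a minimal sketch of processing keys on the fly instead of collecting them (deleting each matched key is just an illustrative choice):

// Delete every key matching the pattern without buffering them in an array.
async function deleteKeys(pattern) {
  for await (const key of client.scanIterator({ MATCH: pattern })) {
    await client.del(key); // node-redis v4 commands return promises
  }
}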
I think the node bindings for Redis are pushing too much responsibility to the caller here. So I created my own library for scanning as well, using generators in node:
const redis = require('redis')
const client = redis.createClient(…)
const generators = require('redis-async-gen')
const { keysMatching } = generators.using(client)
…
for await (const key of keysMatching('test*')) {
  console.info(key)
}
It's the last bit that you should obviously care about. Instead of having to carefully control an iterator yourself, all you need to do is use a for await...of loop.
I wrote more about it here.
Go through this, it may help:
https://github.com/fritzy/node-redisscan
Don't use the library as-is; go through the code available at
https://github.com/fritzy/node-redisscan/blob/master/index.js

Nodejs, not waiting for Redis query to complete before continuing with execution

Using Node.js, I need to load three files dynamically with a require() call, fetching each file path from Cassandra. From each file I need to fetch data that is in Redis and do some validation before loading the next file from Cassandra. The issue here is: before the validation logic executes and provides a result, the next file starts loading in parallel. The validation result comes back after the second file has loaded, which shouldn't happen. The second file should only load after the first file's validation is complete, and only if the validation result is a success. Please help me: how do I pause or wait for Redis to complete the query in Node.js?
node.js
"use strict";
var express = require('express');
var cassandra = require('cassandra-driver');
var app = express();
var Promise = require('bluebird');
var redis = Promise.promisifyAll(require('redis'));
var redisClient = redis.createClient(6379, '127.0.0.1');
var client = new cassandra.Client({contactPoints: ['127.0.0.1'], keyspace: 'poc'});
client.execute("SELECT file FROM testqry1", function (err, result) {
if (!err){
if ( result.rows.length > 0 ) {
for(var i=0; i< result.rows.length; i++){
var filePath=result.rows[i].get('file');
var newdat=Promise.promisifyAll(require(filePath));
var res = newdat(redisClient);
console.log('res:::'+res);
if (res=='failed'){
return;
}
}
} else {
console.log("No results");
}
}
});
file1.js
var crypto = require('crypto');
var redisValue = '';

module.exports = function(redisclient){
  redisclient.hmgetAsync("testdata", "text1").then(function(redisValue){
    console.log('value from redis::' + redisValue);
  }).then(function(){
    var hashedUserID = crypto.createHmac('sha256', 'sample')
      .update('helloworld')
      .digest('hex');
    function disp(value) {
      console.log('value::' + value);
    }
    disp(hashedUserID);
    console.log('redisValue::' + redisValue);
    if (hashedUserID == 'e043e7e68058c8a4cd686db38f01771bd7a04b8bb9a658d3cb40d0be45935094'){
      redata = 'true';
    } else {
      redata = 'false';
    }
    console.log('redata::' + redata);
  });
};
file2.js & file3.js as same content
var result1 = '';

module.exports = function(redisclient){
  redisclient.hmget("testdata", "text1", function(err, redisValue){
    console.log('redisValue2 == %s', redisValue);
    if (redisValue == 'test value'){
      result1 = "success";
    } else {
      result1 = "failed";
    }
  });
  return result1;
};
Output :
res:::undefined
res:::
res:::
value from redis::test data here
value::e043e7e68058c8a4cd686db38f01771bd7a04b8bb9a658d3cb40d0be45935094
redisValue::
redata::true
redisValue2 == test data here
redisValue3 == hello world test data
You say that file2/3 are "same content" but they aren't in one critical area. Per Bluebird's documentation for promisifyAll (see http://bluebirdjs.com/docs/api/promise.promisifyall.html), this feature creates an ...Async version of each core function in the Redis client. You call hmgetAsync in your first case, but you only call hmget in your others.
This is important because you're using an async pattern but with a non-async code structure. In file2/3 you set result1 inside an async callback, but then return it below each call before the call could possibly have returned.
You have two choices:
1: You can convert file2/3/etc to a fully traditional pattern by passing in a callback in addition to the redis client:
module.exports = function(redisclient, callback){
Instead of returning result1, you would then call the callback with this value:
if (redisValue == 'test value'){
  callback(null, "success");
} else {
  callback("failed", null);
}
2: You could convert file2/3/..N to be Promise-based, in which case you do not need to promisifyAll(require(...)) them - you can simply require() them. Such a pattern might look like:
module.exports = function(redisclient){
  return redisclient.hmgetAsync("testdata", "text1");
};
This is a much simpler and cleaner option, and if you keep going with it you can see that you could probably even eliminate the require() and simply do the hmgetAsync in file1 with appropriate data returned by Cassandra. But it's hard to know without seeing your specific application needs. In any event, Promise-based patterns are generally much shorter and cleaner, but not always better - there IS a moderate performance overhead for using them. It's your call which way you go - either will work.
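To illustrate option 2 from the caller's side, here is a minimal sketch of how the main file could then process the Cassandra rows strictly in sequence. The comparison against 'test value' mirrors file2.js; the overall structure is an assumption about how your app fits together, not a drop-in replacement:

client.execute("SELECT file FROM testqry1", function (err, result) {
  if (err || result.rows.length === 0) return console.log("No results");

  // Chain the per-file promises so each file is validated before the next one loads.
  var chain = Promise.resolve();
  result.rows.forEach(function (row) {
    chain = chain.then(function () {
      var validate = require(row.get('file')); // each module now returns a promise
      return validate(redisClient).then(function (redisValue) {
        if (redisValue != 'test value') {
          throw new Error('failed'); // stops the chain, like the original return
        }
        console.log('res:::success');
      });
    });
  });

  chain.catch(function (e) {
    console.log('res:::' + e.message);
  });
});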

convert mongoose stream to array

I have worked with MongoDB but am quite new to the Mongoose ORM. I was trying to fetch data from a collection, and the explain() output was showing 50ms, yet the overall time it took to fetch the data via Mongoose was 9 seconds. Here is the query:
Node.find({'dataset': datasetRef}, function (err, nodes){
  // handle error and data here
});
Then I applied an index on the field I was querying on. The explain() output now showed 4ms, but the total time to retrieve data via Mongoose did not change. Then I searched a bit and found that using lean() can bring the performance of read queries in Mongoose quite close to native MongoDB.
So I changed my query to:
Node.find({'dataset': datasetRef})
  .lean()
  .stream({transform: JSON.stringify})
  .pipe(res)
This solved the performance issues completely. But the end result is a stream of JSON docs like this:
{var11: val11, var12: val12}{var21: val21, var22: val22} ...
How do I parse this to form an array of docs? Or should I not be using a stream at all? In my opinion, there is no point using a stream if I am planning to form the array on the backend, since I will then have to wait for all the docs to be read into memory. But I also think that parsing and creating the whole array on the front end might be costly.
How can I achieve the best performance in this case without clogging the network?
UPDATE
I am trying to solve this problem using a through stream. However, I am not able to insert commas in between the JSON objects yet. See the code below:
res.write("[");
var through = require('through');
var tr = through(
function write(data){
this.queue(data.replace(/\}\{/g,"},{"));
}
);
var dbStream = db.node.find({'dataset': dataSetRef})
.lean()
.stream({'transform': JSON.stringify});
dbStream.on("end", function(){
res.write("]");
});
dbStream
.pipe(tr)
.pipe(res);
With this, I am able to get the "[" at the beginning and the "]" at the end. However, I am still not able to get the pattern "}{" replaced with "},{". Not sure what I am doing wrong.
UPDATE 2
Now I have figured out why the replace is not working. Since I have specified the transform function as JSON.stringify, it emits one JSON object at a time and hence never encounters the pattern }{, because it never handles multiple JSON elements at once.
Now I have modified my code and written a custom transform function which does JSON.stringify and then appends a comma at the end. The only problem I am facing here is that I don't know when it is the last JSON object in the stream, because I don't want to append the comma in that case. At the moment, I append an empty JSON object once the end is encountered, but somehow this does not look like a convincing idea. Here is the code:
res.write("[");
function transform(data){
return JSON.stringify(data) + ",";
}
var dbStream = db.node.find({'dataset': dataSetRef})
.lean()
.stream({'transform': transform});
dbStream.on("end", function(){
res.write("{}]");
});
dbStream
.pipe(res);
The only problem I am facing here is that I don't know when it is the last JSON object in the stream.
But you do know which one is first. Knowing that, instead of appending the comma, you can prepend it to every object except the first one. In order to do that, set up your transform function inside a closure:
function transformFn(){
  var first = true;
  return function(data) {
    if (first) {
      first = false;
      return JSON.stringify(data);
    }
    return "," + JSON.stringify(data);
  };
}
Now you can just call that function and set it as your actual transform.
var transform = transformFn();
res.write("[");

var dbStream = db.node.find({'dataset': dataSetRef})
  .lean()
  .stream({'transform': transform});

dbStream.on("end", function(){
  res.write("]");
});

dbStream
  .pipe(res);
#cbajorin and #rckd both gave correct answers.
However, repeating this code all the time seems like a pain.
Hence my solution uses an extra Transform stream to achieve the same thing.
import { Transform } from 'stream'
class ArrayTransform extends Transform {
constructor(options) {
super(options)
this._index = 0
}
_transform(data, encoding, done) {
if (!(this._index++)) {
// first element, add opening bracket
this.push('[')
} else {
// following element, prepend comma
this.push(',')
}
this.push(data)
done()
}
_flush(done) {
if (!(this._index++)) {
// empty
this.push('[]')
} else {
// append closing bracket
this.push(']')
}
done()
}
}
Which in turn can be used as:
const toArray = new ArrayTransform();
Model.find(query).lean().stream({transform: JSON.stringify })
  .pipe(toArray)
  .pipe(res)
EDIT: added check for empty
I love #cdbajorin's solution, so I created a more readable version of it (ES6):
Products
  .find({})
  .lean()
  .stream({
    transform: (() => {
      let index = 0;
      return (data) => {
        return (!(index++) ? '[' : ',') + JSON.stringify(data);
      };
    })() // invoke immediately so each stream gets its own index
  })
  .on('end', () => {
    res.write(']');
  })
  .pipe(res);
var mongoose = require('mongoose');
mongoose.connect('mongodb://localhost/shoppingdb');

var Sports = mongoose.model('sports', {});
var result = [];
var prefix_out = "your info";

Sports.find({"goods_category": "parts"}).
  cursor().
  on("data", function(doc){
    // stream ---> string
    var str = JSON.stringify(doc);
    // string ---> JSON
    var json = JSON.parse(str);
    // handle your property
    json.handleYourProperty = prefix_out + json.imageURL;
    result.push(json);
  }).
  on('error', function(err){
    console.log(err);
  }).
  on('close', function(){
    console.log(result);
  });
