I have streaming data coming in with IP address. I want to translate the IP to longitude and latitude before putting the data into my database.
This is what I was doing but it is causing some issues. I also tried putting locationObject outside the for loop. That weirdly is using a lot of memory. I know this is blocking code but it should be fast. Though I see memory issue as data object is coming from a stream continuously ans each data object is huge.
for (var i ==0; i < data.length; i++){
if (data.client_ip !== null) {
var locationLookup = maxmind.openSync('./GeoIP2-City.mmdb');
var ip = data.client_ip;
var maxmindObj = locationLookup.get(ip);
locationObject.country = maxmindObj.country.names.en;
locationObject.latitude = maxmindObj.location.latitude;
locationObject.longitude = maxmindObj.location.longitude;
}
}
Again trying to put maxmind.openSync('./GeoIP2-City.mmdb'); outside fr loop is causing huge increase in memory.
The Other option is to use nonblocking code
maxmind.open('/path/to/GeoLite2-City.mmdb', (err, cityLookup) => {
var city = cityLookup.get('66.6.44.4');
});
But I don't think this is a good dea to put this inside a loop.
How can I handle this? I am getting data object every minute
https://github.com/runk/node-maxmind
I'm not sure why you think reading the database file for each iteration of the loop would be fast ("blocking code" doesn't equal "fast code"), it's much better to read the database file once and then loop over data.
maxmind.openSync() will read the entire database into memory, which is mentioned in the README:
Be careful with sync version! Since mmdb files are quite large
(city database is about 100Mb) fs.readFileSync blocks whole
process while it reads file into buffer.
If you don't have memory to spare, the only other option would be to open the file asynchronously. Again, not inside the loop, but outside of it:
maxmind.open("./GeoIP2-City.mmdb", (err, locationLookup) => {
for (var i = 0; i < data.length; i++) {
if (data.client_ip !== null) {
var ip = data.client_ip;
var maxmindObj = locationLookup.get(ip);
locationObject.country = maxmindObj.country.names.en;
locationObject.latitude = maxmindObj.location.latitude;
locationObject.longitude = maxmindObj.location.longitude;
}
}
});
The only thing I am worried is over time I call this function so many times. every time my consumers read jsonObject from kakfa (happening every minute). is there a much better way to optimize that as well. so I call this function every minute. How can I better optimize this further
function processData(jsonObject) {
maxmind.open('./GeoIP2-City.mmdb', function(err, locationLookup) {
if (err) {
logger.error('something went wrong on maxmind fetch', err);
}
for (var i = 0; i < jsonObject.length; i++) { ...}
})
}
Related
I am currently trying to implement SPIMI index construction method in Node and I have ran into an issue.
The code is the following:
let fs = require("fs");
let path = require("path");
module.exports = {
fileStream: function (dirPath, fileStream) {
return buildFileStream(dirPath, fileStream);
},
buildSpimi: function (fileStream, outDir) {
let invIndex = {};
let sortedInvIndex = {};
let fileNameCount = 1;
let outputTXT = "";
let entryCounter = 0;
let resString = "";
fileStream.forEach((filePath, fileIndex) => {
let data = fs.readFileSync(filePath).toString('utf-8');
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(function (ch) { return ch.length != 0; });
data.forEach(token => {
//CHANGE THE SIZE IF NECESSARY (4e+?)
if (entryCounter > 100000) {
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
resString = "";
entryCounter = 0;
sortedInvIndex = {};
invIndex = {};
console.log(outputTXT + " - written;");
fileNameCount++;
};
if (invIndex[token] == undefined) {
invIndex[token] = [];
entryCounter++;
};
if (!invIndex[token].includes(fileIndex)) {
invIndex[token].push(fileIndex);
entryCounter++;
};
});
});
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
console.log(outputTXT + " - written;");
}
}
function buildFileStream(dirPath, fileStream) {
fileStream = fileStream || 0;
fs.readdirSync(dirPath).forEach(function (file) {
let filepath = path.join(dirPath, file);
let stat = fs.statSync(filepath);
if (stat.isDirectory()) {
fileStream = buildFileStream(filepath, fileStream);
} else {
fileStream.push(filepath);
}
});
return fileStream;
}
I am using the exported functions in a separate file:
let spimi = require("./spimi");
let outputDir = "/Users/me/Desktop/SPIMI_OUT/"
let inputDir = "/Users/me/Desktop/gutenberg/2/2";
fileStream = [];
let result = spimi.fileStream(inputDir, fileStream);
console.table(result)
console.log("Finished building the filestream");
let t0 = new Date();
spimi.buildSpimi(result, outputDir);
let t1 = new Date();
console.log(t1 - t0);
While this code kind of works when trying on relatively small volumes of data (I tested up to 1.5 GB), there is obviously a memory leak somewhere, as when monitoring the RAM usage I can see it going up as far as to 4-5 GB).
I spent quite a lot of time trying to figure out what might be the cause, but I still couldn't find the issue.
I would appreciate any hints on this!
Thanks!
Something to understand about the language and garbage collection in general is that this:
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(...)
creates three additional copies of your data. First, an uppercase copy. Then, a split array copy. Then, a filtered copy of the split array.
So, at this point, you have four copies of your data all in memory. All, but the filtered array are now eligible for garbage collection when the GC gets a chance to run, but if this data was initially large, you're going to be using at least 3x-4x as much memory as the filesize (depending upon how many array items are removed in your .filter() operation).
None of this is a leak, but it's a very big peak memory usage which can be a problem.
A more memory efficient way to process large files is to process them as a stream (not read them all into memory at once). You read a small size chunk (say 1024 bytes), process it, read a chunk, process it while being careful about chunk boundaries. If your file naturally has line boundaries, there are already pre-built solutions for processing line by line. If not, you can create your own chunk processing mechanism. We would have to see a sample of your data to make more specific chunk processing suggestions.
As another point, if you end up with a lot of keys in invIndex, then this line of code starts to become inefficient and you're doing it in your loop:
Object.keys(invIndex).sort()
This takes your object and gets all the keys in a temporary array which you use only for the purposes of updating the sortedInvIndex which is yet another copy of your data. So, right there alone, this set of code makes three copies of all your keys and two copies of all the values. And, it does it every time through your loop. Again, lots of peak memory usage that the GC won't normally clean up until your function is done.
A redesign to the way you process this data could probably reduce the peak memory usage by a factor of 100x. For memory efficiency, you want only the initial data, the final data representation and then just a little more used for temporary transformations to over be in use at the same time. You don't want to EVER be processing all the data multiple times because each time you do that, it creates yet another entire copy of all the data that contributes to peak memory usage.
If you show what the data input looks like and what data structure you're trying to end up with, I could probably take a crack at a much more efficient implementation.
Mykhailo, adding on to what jfriend said, it's actually not a memory leak. It's working as intended.
Something to consider is that readFile buffers the entire file! This will cause the huge memory bloat. Better alternative is to implement fs.createReadStream() which will only buffer the part of the file you're currently reading. Unfortunately, implementing that solution may require a full rewrite of your code as it returns fs.ReadStream which won't behave the way you're currently handling files Checkout this link and read the bottom of the section to see what I'm referencing
My Node.js program - which is an ordinary command line program that by and large doesn't do anything operationally unusual, nothing system-specific or asynchronous or anything like that - needs to write messages to a file from time to time, and then it will be interrupted with ^C and it needs the contents of the file to still be there.
I've tried using fs.createWriteStream but that just ends up with a 0-byte file. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
I've tried using winston but that ends up not creating the file at all. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
And fs.writeFile works perfectly when you have all the text you want to write up front, but doesn't seem to support appending a line at a time.
What is the recommended way to do this?
Edit: specific code I've tried:
var fs = require('fs')
var log = fs.createWriteStream('test.log')
for (var i = 0; i < 1000000; i++) {
console.log(i)
log.write(i + '\n')
}
Run for a few seconds, hit ^C, leaves a 0-byte file.
Turns out Node provides a lower level file I/O API that seems to work fine!
var fs = require('fs')
var log = fs.openSync('test.log', 'w')
for (var i = 0; i < 100000; i++) {
console.log(i)
fs.writeSync(log, i + '\n')
}
NodeJS doesn't work in the traditional way. It uses a single thread, so by running a large loop and doing I/O inside, you aren't giving it a chance (i.e. releasing the CPU) to do other async operations for eg: flushing memory buffer to actual file.
The logic must be - do one write, then pass your function (which invokes the write) as a callback to process.nextTick or as callback to the write stream's drain event (if buffer was full during last write).
Here's a quick and dirty version which does what you need. Notice that there are no long-running loops or other CPU blockage, instead I schedule my subsequent writes for future and return quickly, momentarily freeing up the CPU for other things.
var fs = require('fs')
var log = fs.createWriteStream('test.log');
var i = 0;
function my_write() {
if (i++ < 1000000)
{
var res = log.write("" + i + "\r\n");
if (!res) {
log.on('drain',my_write);
} else {
process.nextTick(my_write);
}
console.log("Done" + i + " " + res + "\r\n");
}
}
my_write();
This function might also be helpful.
/**
* Write `data` to a `stream`. if the buffer is full will block
* until it's flushed and ready to be written again.
* [see](https://nodejs.org/api/stream.html#stream_writable_write_chunk_encoding_callback)
*/
export function write(data, stream) {
return new Promise((resolve, reject) => {
if (stream.write(data)) {
process.nextTick(resolve);
} else {
stream.once("drain", () => {
stream.off("error", reject);
resolve();
});
stream.once("error", reject);
}
});
}
You are writing into file using for loop which is bad but that's other case. First of all createWriteStream doesn't close the file automatically you should call close.
If you call close immediately after for loop it will close without writing because it's async.
For more info read here: https://nodejs.org/api/fs.html#fs_fs_createwritestream_path_options
Problem is async function inside for loop.
var pass = require('./pass.js');
var fs = require('fs');
var path = "password.txt";
var name ="admin";
var
remaining = "",
lineFeed = "\r\n",
lineNr = 0;
var log =
fs.createReadStream(path, { encoding: 'utf-8' })
.on('data', function (chunk) {
// store the actual chunk into the remaining
remaining = remaining.concat(chunk);
// look that we have a linefeed
var lastLineFeed = remaining.lastIndexOf(lineFeed);
// if we don't have any we can continue the reading
if (lastLineFeed === -1) return;
var
current = remaining.substring(0, lastLineFeed),
lines = current.split(lineFeed);
// store from the last linefeed or empty it out
remaining = (lastLineFeed > remaining.length)
? remaining.substring(lastLineFeed + 1, remaining.length)
: "";
for (var i = 0, length = lines.length; i < length; i++) {
// process the actual line
var account={
username:name,
password:lines[i],
};
pass.test(account);
}
})
.on('end', function (close) {
// TODO I'm not sure this is needed, it depends on your data
// process the reamining data if needed
if (remaining.length > 0) {
var account={
username:name,
password:remaining,
};
pass.test(account);
};
});
I tried to do something like test password of account "admin", pass.test is a function to test the password, I download a weak password dictionary with a large number of lines,so I search for way to read that many lines of weak password,but with code above, the lines array became too large ,and run out of memory,what should I do?
Insofar as my limited understanding goes, you need to watch a 1GB limit, which I believe is imposed by the V8 engine, actually. (Here's a link, actually saying the limit is 1.4 GB, currently, and lists the different params used to change this manually.) Depending on where you host your node app(s), you can increase this limit, by a param set on the command line when node is started. Again, see the linked article for a few ways to do this.
Also, you might want to make sure that, whenever possible, you use buffers, instead of converting things like data streams (from a DB or other things, for instance) to arrays/whatever, as this will then load the entire dataset into memory. As long as it lives in a buffer, it doesn't contribute to the total memory footprint of your app.
And actually, one thing that doesn't make sense, and that seems to be very inefficient in your app, is that, on reading each chunk of data in, you then check your username against EVERY username you've amassed so far, in your lines array, instead of the LAST one. What your app should do is keep track of the last username and password combo you've read in, and then delete all data before this user, in your remaining variable, so you keep your memory down. And since it's not a hold all repository for every line of your password file anymore, you should probably retitle it something like buffer or something. This means that you'd remove your for loop, since you're already "looping" through the data in your password file, by reading it in, chunk by chunk.
I know that when developing in node, you should always try to avoid blocking (sync) functions and go with async functions, however, I did a little test to see how they compare.
I need to open a JSON file that contains i18n data (like date and time formats, etc) and pass that data to a class that uses this data to format numbers, etc in my view.
It would be kind of awkward to start wrapping all the class's methods inside callbacks, so if possible, I would use the synchronous version instead.
console.time('one');
console.time('two');
fs.readFile( this.dir + "/" + locale + ".json", function (err, data) {
if (err) cb( err );
console.timeEnd('one');
});
var data = fs.readFileSync( this.dir + "/" + locale + ".json" );
console.timeEnd('two');
This results in the following lines in my console:
two: 1ms
one: 159ms
It seems that fs.readFileSync is about 150 times faster than fs.readFile - it takes about 1 ms to load a 50KB JSON file (minified). All my JSON files are around 50-100KB.
I was also thinking maybe somehow memoizing or saving this JSON data to the session so that the file is read-only once per session (or when the user changes their locale). I'm not entirely sure how to do that, it's just an idea.
Is it okay to use fs.readFileSync in my case or will I get in trouble later?
No, it is not OK to use a blocking API call in a node server as you describe. Your site's responsiveness to many concurrent connections will take a huge hit. It's also just blatantly violating the #1 principle of node.
The key to node working is that while it is waiting on IO, it is doing CPU/memory processing at the same time. This requires asynchronous calls exclusively. So if you have 100 clients reading 100 JSON files, node can ask the OS to read those 100 files but while waiting for the OS to return the file data when it is available, node can be processing other aspects of those 100 network requests. If you have a single synchronous call, ALL of your client processing stops entirely while that operation completes. So client number 100's connection waits with no processing whatsoever while you read files for clients 1, 2, 3, 4, and so on sequentially. This is Failville.
Here's another analogy. If you went to a restaurant and were the only customer, you would probably get faster service if a single person sat you, took your order, cooked it, served it to you, and handled the bill without the coordination overhead of dealing with the host/hostess, server, head chef, line cooks, cashiers, etc. However, with 100 customers in the restaurant, the extra coordination means things happen in parallel and the overall responsiveness of the restaurant is increased way beyond what it would be if a single person were trying to handle 100 customers on their own.
You are blocking the callback of the asynchronous read with your synchronous read, remember single thread.
Now I understand that the time difference is still amazing, but you should try with a file that is much, much longer to read and imagine that many, many clients will do the same, only then the overhead will pay off.
That should answer your question, yes you will run into trouble if you are serving thousands
of requests with blocking IO.
After a lot of time and a lot of learn & practice I've tried once more and I've found the answer and I can show some example:
const fs = require('fs');
const syncTest = () => {
let startTime = +new Date();
const results = [];
const files = [];
for (let i=0, len=4; i<len; i++) {
files.push(fs.readFileSync(`file-${i}.txt`));
};
for (let i=0, len=360; i<len; i++) results.push(Math.sin(i), Math.cos(i));
console.log(`Sync version: ${+new Date() - startTime}`);
};
const asyncTest = () => {
let startTime = +new Date();
const results = [];
const files = [];
for (let i=0, len=4; i<len; i++) {
fs.readFile(`file-${i}.txt`, file => files.push(file));
};
for (let i=0, len=360; i<len; i++) results.push(Math.sin(i), Math.cos(i));
console.log(`Async version: ${+new Date() - startTime}`);
};
syncTest();
asyncTest();
Yes, it's correct, to deal with the asynchronous way in a server-side environment. But if their use case is different like to generating the build as in client-side JS project, meanwhile reading and writing the JSON files for different flavors.
It doesn't affect that much. Although we needed a rapid manner to create a minified build for deployment (here synchronous comes into the picture).
for more info and library
I've tried to check the real, measurable difference in a speed between fs.readFileSync() and fs.readFile() for downloading 3 different files which are on SD card and I've added between this downloads some math calculation and I don't understand where is the difference in speed which is always showed on node pictures when node is faster also in simple operation like downloading 3 times the same file and the time for this operation is close to time which is needed for downloading 1 time this file.
I understand that this is no doubtly useful that server during downloading some file is able to doing other job but a lot of time on youtube or in books there are some diagrams which are not precise because when you have a situation like below async node is slower then sync in reading small files(like below: 85kB, 170kB, 255kB).
var fs = require('fs');
var startMeasureTime = () => {
var start = new Date().getTime();
return start;
};
// synch version
console.log('Start');
var start = startMeasureTime();
for (var i = 1; i<=3; i++) {
var fileName = `Lorem-${i}.txt`;
var fileContents = fs.readFileSync(fileName);
console.log(`File ${1} was downloaded(${fileContents.length/1000}KB) after ${new Date().getTime() - start}ms from start.`);
if (i === 1) {
var hardMath = 3*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9;
};
};
// asynch version
setImmediate(() => {
console.log('Start');
var start = startMeasureTime();
for (var i = 1; i<=3; i++) {
var fileName = `Lorem-${i}.txt`;
fs.readFile(fileName, {encoding: 'utf8'}, (err, fileContents) => {
console.log(`File ${1} was downloaded(${fileContents.length/1000}KB) after ${new Date().getTime() - start}ms from start.`);
});
if (i === 1) {
var hardMath = 3*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9*54/25*35/46*255/34/9;
};
};
});
This is from console:
Start
File 1 was downloaded(255.024KB) after 2ms from start.
File 1 was downloaded(170.016KB) after 5ms from start.
File 1 was downloaded(85.008KB) after 6ms from start.
Start
File 1 was downloaded(255.024KB) after 10ms from start.
File 1 was downloaded(85.008KB) after 11ms from start.
File 1 was downloaded(170.016KB) after 12ms from start.
I have a massive for loop and I want to allow I/O to continue while I'm processing. Maybe every 10,000 or so iterations. Any way for me to allow for additional I/O this way?
A massive for loop is just you blocking the entire server.
You have two options, either put the for loop in a new thread, or make it asynchronous.
var data = [];
var next = function(i) {
// do thing with data [i];
process.nextTick(next.bind(this, i + 1));
};
process.nextTick(next.bind(this, 0));
I don't recommend the latter. Your just implementing naive time splicing which the OS level process scheduler can do better then you.
var exec = require("child_process").exec
var s = exec("node " + filename, function (err, stdout, stderr) {
stdout.on("data", function() {
// handle data
});
});
Alternatively use something like hook.io to manage processes for you.
Actually you probably want to aggressively redesign your codebase if you have a blocking for loop.
Maybe something like this to break your loop into chunks...
Instead of:
for (var i=0; i<len; i++) {
doSomething(i);
}
Something like:
var i = 0, limit;
while (i < len) {
limit = (i+10000);
if (limit > len)
limit = len;
process.nextTick(function(){
for (; i<limit; i++) {
doSomething(i);
}
});
}
}
The nextTick() call gives a chance for other events to get in there, but it still does most looping synchronously which (I'm guessing) will be a lot faster than creating a new event for every iteration. And obviously, you can experiment with the number (10,000) till you get the results you want.
In theory, you could also use setTimeout() rather than nextTick(), in case it turns out that giving other processes a somewhat bigger "time-slice" helps. That gives you one more variable (the timeout milliseconds) that you can use for tuning.