From node.js, which is faster, shell grep or fs.readFile?

I have a long running node.js process and I need to scan a log file for a pattern. I have at least two obvious choices: spawn a grep process or read the file using fs.read* and parse the buffer/stream in node.js. I haven't found a comparison of the two methods on the intarwebs. My question is twofold:
which is faster?
why might I prefer one technique over the other?

Here's my node.js implementation; the results are pretty much as expected: scanning small files (up to 2-3k short lines) in node is faster than forking a grep, while scanning large files is slower. The larger the file, the bigger the difference. (And perhaps the more complex the regex, the smaller the difference -- see below.)
I used my own qfgets package for fast line-at-a-time file i/o; there may be better ones out there, I don't know.
I saw an unexpected anomaly that I did not investigate: the timings below are for the constant string regexp /foobar/. When I changed it to /[f][o][o][b][a][r]/ to actually exercise the regex engine, grep slowed down 3x, while node sped up! The 3x slowdown of grep is reproducible on the command line.
filename = "/var/log/apache2/access.log"; // 2,540,034 lines, 187MB
//filename = "/var/log/messages"; // 25,703 lines, 2.5MB
//filename = "out"; // 2000 lines, 188K (head -2000 access.log)
//filename = "/etc/motd"; // 7 lines, 286B
regexp = /foobar/;
child_process = require('child_process');
qfgets = require('qfgets');
function grepWithFs( filename, regexp, done ) {
    fp = new qfgets(filename, "r");
    function loop() {
        for (i = 0; i < 40; i++) {
            line = fp.fgets();
            if (line && line.match(regexp)) process.stdout.write(line);
        }
        if (!fp.feof()) setImmediate(loop);
        else done();
    }
    loop();
}

function grepWithFork( filename, regexp, done ) {
    cmd = "egrep '" + regexp.toString().slice(1, -1) + "' " + filename;
    child_process.exec(cmd, {maxBuffer: 200000000}, function(err, stdout, stderr) {
        process.stdout.write(stdout);
        done(err);
    });
}
The test:
function fptime() { t = process.hrtime(); return t[0] + t[1]*1e-9 }

t1 = fptime();
if (0) {
    grepWithFs(filename, regexp, function(){
        console.log("fs done", fptime() - t1);
    });
}
else {
    grepWithFork(filename, regexp, function(err){
        console.log("fork done", fptime() - t1);
    });
}
Results:
/**
results (all file contents memory resident, no disk i/o):
times in seconds, best run out of 5

/foobar/
               fork      fs
motd          .00876    .00358    0.41 x    7 lines
out           .00922    .00772    0.84 x    2000 lines
messages      .0101     .0335     3.32 x    25.7 k lines
access.log    .1367     1.032     7.55 x    2.54 m lines

/[f][o][o][b][a][r]/
access.log    .4244     .8348     1.97 x    2.54 m lines
**/
(The above code was all one file, I split it up to avoid the scrollbar)
Edit: to highlight the key results:
185MB, 2.54 million lines, search RegExp /[f][o][o][b][a][r]/:
grepWithFs
elapsed: .83 sec
grepWithFork
elapsed: .42 sec

To answer this question, I wrote this little program.
#!/usr/local/bin/node
'use strict';
const fs = require('fs');
const log = '/var/log/maillog';
const fsOpts = { flag: 'r', encoding: 'utf8' };
const wantsRe = new RegExp(process.argv[2]);
function handleResults (err, data) {
    console.log(data);
}

function grepWithFs (file, done) {
    fs.readFile(file, fsOpts, function (err, data) {
        if (err) throw (err);
        let res = '';
        data.toString().split(/\n/).forEach(function (line) {
            if (wantsRe && !wantsRe.test(line)) return;
            res += line + '\n';
        });
        done(null, res);
    });
}

function grepWithShell (file, done) {
    const spawn = require('child_process').spawn;
    let res = '';
    const child = spawn('grep', [ '-e', process.argv[2], file ]);
    child.stdout.on('data', function (buffer) { res += buffer.toString(); });
    child.stdout.on('end', function () { done(null, res); });
}

for (let i = 0; i < 10; i++) {
    // grepWithFs(log, handleResults);
    grepWithShell(log, handleResults);
}
Then I alternately ran both functions inside a loop 10x and measured the time it took them to grep the result from a log file that's representative of my use case:
$ ls -alh /var/log/maillog
-rw-r--r-- 1 root wheel 37M Feb 8 16:44 /var/log/maillog
The file system is a pair of mirrored SSDs which are generally quick enough that they aren't the bottleneck. Here are the results:
grepWithShell
$ time node logreader.js 3E-4C03-86DD-FB6EF
real 0m0.238s
user 0m0.181s
sys 0m1.550s
grepWithFs
$ time node logreader.js 3E-4C03-86DD-FB6EF
real 0m6.599s
user 0m5.710s
sys 0m1.751s
The difference is huge. Using a shell grep process is dramatically faster. As Andras points out, node's I/O can be tricky, and I didn't try any other fs.read* methods. If there's a better way, please do point it out (preferably with a similar test scenario and results).
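For completeness, here is a hedged sketch (not benchmarked here) of one streaming alternative: readline over fs.createReadStream avoids loading and splitting the whole file in memory; whether it closes the gap to grep would need measuring.
const fs = require('fs');
const readline = require('readline');

function grepWithReadline (file, done) {
    let res = '';
    const rl = readline.createInterface({ input: fs.createReadStream(file) });
    rl.on('line', function (line) {
        if (wantsRe.test(line)) res += line + '\n';   // same wantsRe as above
    });
    rl.on('close', function () { done(null, res); });
}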

Forking a grep is simpler and quicker, and grep would most likely run faster and use less CPU. Although fork has a moderately high overhead (much more than opening a file), you would only fork once and stream the results. Plus it can be tricky to get good performance out of node's file i/o.
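As a minimal sketch of the fork-once-and-stream idea (my illustration, assuming a Unix egrep on the PATH): use spawn instead of exec, so matches stream through stdout rather than being buffered into one giant string via maxBuffer.
const { spawn } = require('child_process');

function grepWithSpawn (filename, regexp, done) {
    const child = spawn('egrep', [regexp.source, filename]);
    child.stdout.pipe(process.stdout);   // stream matches as they arrive
    child.on('close', function (code) {
        // grep exits with 1 when nothing matched, which is not an error here
        done(code > 1 ? new Error('egrep exited with ' + code) : null);
    });
}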

Related

Handle huge amount of data in node.js through stdin

Old question title:
"node.js readline form net.Socket (process.stdin) cause error: heap out of memory (conversion of net.Socket Duplex to Readable stream)"
... I've changed it because nobody answered and it seems like an important question in the node.js ecosystem.
The question is how to solve the "heap out of memory" error when reading line by line from a huge stdin. The error does not happen when you dump stdout to a file (e.g. test.log) and feed the 'readline' interface through fs.createReadStream('test.log').
It looks like process.stdin is not a Readable stream, as mentioned here:
https://nodejs.org/api/process.html#process_process_stdin
To reproduce the issue I've created two scripts. The first just generates a huge amount of data (a.js):
// a.js
// loop in this form generates about 7.5G of data
// you can check yourself running:
// node a.js > test.log && ls -lah test.log
// will return
// -rw-r--r-- 1 sd staff 7.5G 31 Jan 22:29 test.log
for (let i = 0; i < 8000000; i += 1) {
    console.log(`${i} ${".".repeat(1000)}\n`);
}
The script that consumes it through a bash pipe with readline (b.js):
const fs = require('fs');
const readline = require('readline');
const rl = readline.createInterface({
    input: process.stdin,                      // doesn't work
    // input: fs.createReadStream('test.log'), // works
});

let s;

rl.on('line', line => {
    // deliberately commented out to demonstrate that the issue
    // has nothing to do with anything beyond readline and process.stdin
    // s = line.substring(0, 7);
    //
    // if (s === '100 ...' || s === '400 ...' || s === '7500000') {
    //     process.stdout.write(`${line}\n`);
    // }
});

rl.on('error', e => {
    console.log('general error', e);
});
Now when you run:
node a.js | node b.js
it fails with the error:
FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
but if you swap
const rl = readline.createInterface({
    input: process.stdin,
});
to
const rl = readline.createInterface({
    input: fs.createReadStream('test.log')
});
and run
node a.js > test.log
node b.js
everything works fine
The problem actually comes down to: how do you convert a net.Socket into a fully functional Readable stream, if that is possible at all?
Edit:
Basically my problem is that it seems impossible to handle a huge amount of data from stdin as a stream, which is natural for Unix-style pipes. So despite the fact that node.js is brilliant at handling streams, you can't write a program that handles a huge amount of data through Unix-style pipes.
In some cases it should be completely unnecessary to dump the data to the hard drive and only then handle it with fs.createReadStream('test.log'), yet this limitation forces it.
I thought that streams are all about handling huge amounts of data (among other use cases) on the fly, without saving it to the hard drive.
You can always treat process.stdin as a normal NodeJS stream and handle the reading yourself:
const os = require('os');

function onReadLine(line) {
    // do stuff with line
    console.info(line);
}

// read input and split into lines
let BUFF = '';
process.stdin.on('data', (buff) => {
    const content = buff.toString('utf-8');
    for (let i = 0; i < content.length; i++) {
        if (content[i] === os.EOL) {
            onReadLine(BUFF);
            BUFF = '';
        } else {
            BUFF += content[i];
        }
    }
});

// flush last line
process.stdin.on('end', () => {
    if (BUFF.length > 0) {
        onReadLine(BUFF);
    }
});
Example:
// unix
cat ./somefile.txt | node ./script.js
// windows
Start-Process -FilePath "node" -ArgumentList @(".\script.js") -RedirectStandardInput .\somefile.txt -NoNewWindow -Wait
The problem is not the input data size, and not Node, but a faulty design of your data generator: it does not pause/resume data generation at the request of the consuming output stream. Instead of just pushing data through console.log(...) you should interact with the standard output stream correctly and handle the pause and resume signals from that stream.
The file input stream created by fs.createReadStream() is implemented properly: it pauses and resumes as necessary, and thus does not crash the code.
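As a hedged illustration of that point (not the answerer's code), a.js could be rewritten to respect backpressure: stop writing when process.stdout's buffer is full and resume on 'drain'.
// backpressure-aware version of a.js (sketch)
const TOTAL = 8000000;
let i = 0;

function writeChunk() {
    while (i < TOTAL) {
        const ok = process.stdout.write(`${i} ${".".repeat(1000)}\n`);
        i += 1;
        if (!ok) {
            // stdout's internal buffer is full: wait for it to drain
            process.stdout.once('drain', writeChunk);
            return;
        }
    }
}

writeChunk();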

Node.js memory leak when reading and writing large files

I am currently trying to implement the SPIMI index construction method in Node and I have run into an issue.
The code is the following:
let fs = require("fs");
let path = require("path");

module.exports = {
    fileStream: function (dirPath, fileStream) {
        return buildFileStream(dirPath, fileStream);
    },
    buildSpimi: function (fileStream, outDir) {
        let invIndex = {};
        let sortedInvIndex = {};
        let fileNameCount = 1;
        let outputTXT = "";
        let entryCounter = 0;
        let resString = "";
        fileStream.forEach((filePath, fileIndex) => {
            let data = fs.readFileSync(filePath).toString('utf-8');
            data = data.toUpperCase().split(/[^a-zA-Z]/).filter(function (ch) { return ch.length != 0; });
            data.forEach(token => {
                // CHANGE THE SIZE IF NECESSARY (4e+?)
                if (entryCounter > 100000) {
                    Object.keys(invIndex).sort().forEach((key) => {
                        sortedInvIndex[key] = invIndex[key];
                    });
                    outputTXT = outDir + "block" + fileNameCount;
                    for (let SItoken in sortedInvIndex) {
                        resString += SItoken + "," + sortedInvIndex[SItoken].toString();
                    }
                    fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(err); });
                    resString = "";
                    entryCounter = 0;
                    sortedInvIndex = {};
                    invIndex = {};
                    console.log(outputTXT + " - written;");
                    fileNameCount++;
                }
                if (invIndex[token] == undefined) {
                    invIndex[token] = [];
                    entryCounter++;
                }
                if (!invIndex[token].includes(fileIndex)) {
                    invIndex[token].push(fileIndex);
                    entryCounter++;
                }
            });
        });
        Object.keys(invIndex).sort().forEach((key) => {
            sortedInvIndex[key] = invIndex[key];
        });
        outputTXT = outDir + "block" + fileNameCount;
        for (let SItoken in sortedInvIndex) {
            resString += SItoken + "," + sortedInvIndex[SItoken].toString();
        }
        fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(err); });
        console.log(outputTXT + " - written;");
    }
}

function buildFileStream(dirPath, fileStream) {
    fileStream = fileStream || [];   // default to an empty array so .push() below works
    fs.readdirSync(dirPath).forEach(function (file) {
        let filepath = path.join(dirPath, file);
        let stat = fs.statSync(filepath);
        if (stat.isDirectory()) {
            fileStream = buildFileStream(filepath, fileStream);
        } else {
            fileStream.push(filepath);
        }
    });
    return fileStream;
}
I am using the exported functions in a separate file:
let spimi = require("./spimi");
let outputDir = "/Users/me/Desktop/SPIMI_OUT/"
let inputDir = "/Users/me/Desktop/gutenberg/2/2";
fileStream = [];
let result = spimi.fileStream(inputDir, fileStream);
console.table(result)
console.log("Finished building the filestream");
let t0 = new Date();
spimi.buildSpimi(result, outputDir);
let t1 = new Date();
console.log(t1 - t0);
While this code kind of works when tried on relatively small volumes of data (I tested up to 1.5 GB), there is obviously a memory leak somewhere: when monitoring the RAM usage I can see it going as high as 4-5 GB.
I spent quite a lot of time trying to figure out what might be the cause, but I still couldn't find the issue.
I would appreciate any hints on this!
Thanks!
Something to understand about the language and garbage collection in general is that this:
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(...)
creates three additional copies of your data. First, an uppercase copy. Then, a split array copy. Then, a filtered copy of the split array.
So, at this point, you have four copies of your data all in memory. All but the filtered array are now eligible for garbage collection when the GC gets a chance to run, but if this data was initially large, you're going to be using at least 3x-4x as much memory as the file size (depending upon how many array items are removed in your .filter() operation).
None of this is a leak, but it's a very big peak memory usage which can be a problem.
A more memory-efficient way to process large files is to process them as a stream (not read them all into memory at once). You read a small chunk (say 1024 bytes), process it, read another chunk, and so on, being careful about chunk boundaries. If your file naturally has line boundaries, there are already pre-built solutions for processing it line by line. If not, you can create your own chunk-processing mechanism. We would have to see a sample of your data to make more specific chunk-processing suggestions.
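Here is a minimal sketch (my illustration, not part of the original code) of line-by-line processing with the built-in readline module, so only one chunk of the file is buffered at a time; onToken is a hypothetical callback standing in for the per-token work.
const fs = require('fs');
const readline = require('readline');

function processFileByLine(filePath, onToken, onDone) {
    const rl = readline.createInterface({
        input: fs.createReadStream(filePath),
        crlfDelay: Infinity,   // treat \r\n as a single line break
    });
    rl.on('line', (line) => {
        // same tokenization as the original, but one line at a time
        line.toUpperCase().split(/[^A-Z]/).forEach((token) => {
            if (token.length !== 0) onToken(token);
        });
    });
    rl.on('close', onDone);
}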
As another point, if you end up with a lot of keys in invIndex, then this line of code starts to become inefficient and you're doing it in your loop:
Object.keys(invIndex).sort()
This takes your object and gets all the keys in a temporary array which you use only for the purposes of updating the sortedInvIndex which is yet another copy of your data. So, right there alone, this set of code makes three copies of all your keys and two copies of all the values. And, it does it every time through your loop. Again, lots of peak memory usage that the GC won't normally clean up until your function is done.
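For example, here is a hedged sketch of one way to avoid that: sort the keys once, at flush time, and append straight into the output string instead of building sortedInvIndex at all (flushBlock is a hypothetical helper name, not from the original code).
const fs = require('fs');

function flushBlock(invIndex, outputTXT) {
    let resString = '';
    for (const key of Object.keys(invIndex).sort()) {
        resString += key + ',' + invIndex[key].join(',') + '\n';
    }
    // synchronous write, so the block is on disk before the next one starts
    fs.writeFileSync(outputTXT, resString);
}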
A redesign of the way you process this data could probably reduce the peak memory usage by a factor of 100x. For memory efficiency, you want only the initial data, the final data representation, and just a little more for temporary transformations to ever be in use at the same time. You don't want to EVER be processing all the data multiple times, because each time you do that, it creates yet another entire copy of all the data that contributes to peak memory usage.
If you show what the data input looks like and what data structure you're trying to end up with, I could probably take a crack at a much more efficient implementation.
Mykhailo, adding on to what jfriend said, it's actually not a memory leak. It's working as intended.
Something to consider is that readFile buffers the entire file! This will cause huge memory bloat. A better alternative is to use fs.createReadStream(), which will only buffer the part of the file you're currently reading. Unfortunately, implementing that solution may require a full rewrite of your code, as it returns an fs.ReadStream, which won't behave the way you're currently handling files. Check out this link and read the bottom of the section to see what I'm referencing.
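For illustration only (not from the answer), a chunked read with fs.createReadStream() could look roughly like this; 'somefile.txt' is a hypothetical path, the tokenization mirrors the original split, and care is taken for tokens that straddle chunk boundaries.
const fs = require('fs');

const stream = fs.createReadStream('somefile.txt');   // hypothetical input file
let leftover = '';

stream.on('data', (chunk) => {
    const text = leftover + chunk.toString('utf-8');
    const parts = text.toUpperCase().split(/[^A-Z]/);
    leftover = parts.pop();   // a token may continue in the next chunk
    parts.filter(t => t.length !== 0).forEach(token => {
        // feed token into the block builder here
    });
});

stream.on('end', () => {
    if (leftover.length !== 0) {
        // handle the final token
    }
});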

How to write incrementally to a text file and flush output

My Node.js program - which is an ordinary command line program that by and large doesn't do anything operationally unusual, nothing system-specific or asynchronous or anything like that - needs to write messages to a file from time to time, and then it will be interrupted with ^C and it needs the contents of the file to still be there.
I've tried using fs.createWriteStream but that just ends up with a 0-byte file. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
I've tried using winston but that ends up not creating the file at all. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
And fs.writeFile works perfectly when you have all the text you want to write up front, but doesn't seem to support appending a line at a time.
What is the recommended way to do this?
Edit: specific code I've tried:
var fs = require('fs')
var log = fs.createWriteStream('test.log')
for (var i = 0; i < 1000000; i++) {
    console.log(i)
    log.write(i + '\n')
}
Run for a few seconds, hit ^C, leaves a 0-byte file.
Turns out Node provides a lower level file I/O API that seems to work fine!
var fs = require('fs')
var log = fs.openSync('test.log', 'w')
for (var i = 0; i < 100000; i++) {
    console.log(i)
    fs.writeSync(log, i + '\n')
}
NodeJS doesn't work in the traditional way. It uses a single thread, so by running a large loop and doing I/O inside it, you aren't giving it a chance (i.e. releasing the CPU) to do other async operations, e.g. flushing the memory buffer to the actual file.
The logic must be: do one write, then pass your function (which invokes the write) as a callback to process.nextTick, or as a callback to the write stream's drain event (if the buffer was full during the last write).
Here's a quick and dirty version which does what you need. Notice that there are no long-running loops or other CPU blockage, instead I schedule my subsequent writes for future and return quickly, momentarily freeing up the CPU for other things.
var fs = require('fs')
var log = fs.createWriteStream('test.log');
var i = 0;
function my_write() {
    if (i++ < 1000000) {
        var res = log.write("" + i + "\r\n");
        if (!res) {
            // buffer full: resume only once the stream has drained
            log.once('drain', my_write);
        } else {
            process.nextTick(my_write);
        }
        console.log("Done " + i + " " + res + "\r\n");
    }
}

my_write();
This function might also be helpful.
/**
 * Write `data` to a `stream`. If the stream's buffer is full, the returned
 * promise resolves only once the buffer has drained and the stream is ready
 * to be written to again.
 * [see](https://nodejs.org/api/stream.html#stream_writable_write_chunk_encoding_callback)
 */
export function write(data, stream) {
    return new Promise((resolve, reject) => {
        if (stream.write(data)) {
            process.nextTick(resolve);
        } else {
            stream.once("drain", () => {
                stream.off("error", reject);
                resolve();
            });
            stream.once("error", reject);
        }
    });
}
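A hedged usage sketch, assuming the write() helper above is in scope (the module path below is hypothetical): awaiting it in a loop keeps memory bounded, because the loop only continues after the stream has drained.
import fs from 'fs';
import { write } from './write-helper.js';   // hypothetical module path

async function main() {
    const log = fs.createWriteStream('test.log');
    for (let i = 0; i < 1000000; i++) {
        await write(`${i}\n`, log);   // resolves only after 'drain' when the buffer was full
    }
    log.end();
}

main();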
You are writing to the file in a for loop, which is bad, but that's a separate issue. First of all, createWriteStream doesn't close the file automatically; you should call close.
If you call close immediately after the for loop it will close without writing, because the writes are async.
For more info read here: https://nodejs.org/api/fs.html#fs_fs_createwritestream_path_options
The problem is the async function inside the for loop.
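As a minimal sketch of that point (my illustration, not the answerer's code): call end() when you are done and wait for 'finish' before assuming the buffered data has reached the file.
const fs = require('fs');
const log = fs.createWriteStream('test.log');

for (let i = 0; i < 1000; i++) {
    log.write(i + '\n');   // queued in the stream's internal buffer
}

log.end();                 // no more writes; flush whatever is buffered
log.on('finish', () => {
    console.log('all data flushed to test.log');
});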

NODEJS: Uncork() method on writable stream doesn't really flush the data

I am writing a quite simple application to transform data - read one file and write to another. The files are relatively large - 2 GB. However, what I found is that the flush to the file system does not happen on each cork/uncork cycle; it only happens on end(), so end() basically hangs the system until everything is fully flushed.
I simplified the example so it just writes a line to the stream a lot of times.
var PREFIX = 'E:\\TEST\\';
var line = 'AA 11 999999999 20160101 123456 20160101 AAA 00 00 00 0 0 0 2 2 0 0 20160101 0 00';
var fileSystem = require('fs');
function writeStrings() {
    var stringsCount = 0;
    var stream = fileSystem.createWriteStream(PREFIX + 'output.txt');
    stream.once('drain', function () {
        console.log("drained");
    });
    stream.once('open', function (fileDescriptor) {
        var started = false;
        console.log('writing file ');
        stream.cork();
        for (i = 0; i < 2000000; i++) {
            stream.write(line + i);
            if (i % 10000 == 0) {
                // console.log('passed ', i);
            }
            if (i % 100000 == 0) {
                console.log('uncorked ', i, stream._writableState.writing);
                stream.uncork();
                stream.cork();
            }
        }
        stream.end();
    });
    stream.once('finish', function () {
        console.log("done");
    });
}

writeStrings();
Going inside node's _stream_writable.js, I found that it flushes the buffer only on this condition:
if (!state.writing &&
    !state.corked &&
    !state.finished &&
    !state.bufferProcessing &&
    state.buffer.length)
    clearBuffer(this, state);
and, as you can see from the example, the writing flag doesn't get reset after the first uncork(), which prevents uncork() from flushing.
Also, I don't see drain events firing at all. Playing with highWaterMark doesn't help (it actually doesn't seem to have an effect on anything). Manually setting writing to false (plus some other flags) did help, but this is surely wrong.
Am I misunderstanding the concept here?
From the node.js documentation I found that the number of uncork() calls should match the number of cork() calls. I am not seeing a matching stream.uncork() call for the stream.cork() that is called before the for loop. That might be the issue.
Looking at a guide on nodejs.org, you aren't supposed to call stream.uncork() twice in the same event loop. Here is an excerpt:
// Using .uncork() twice here makes two calls on the C++ layer, rendering the
// cork/uncork technique useless.
ws.cork();
ws.write('hello ');
ws.write('world ');
ws.uncork();
ws.cork();
ws.write('from ');
ws.write('Matteo');
ws.uncork();
// The correct way to write this is to utilize process.nextTick(), which fires
// on the next event loop.
ws.cork();
ws.write('hello ');
ws.write('world ');
process.nextTick(doUncork, ws);
ws.cork();
ws.write('from ');
ws.write('Matteo');
process.nextTick(doUncork, ws);
// as a global function
function doUncork(stream) {
    stream.uncork();
}
.cork() can be called as many times as we want; we just need to be careful to call .uncork() the same number of times to make the data flow again.
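For illustration (a sketch under the same assumptions as the question's code, not a tested fix), the loop could be restructured so each uncork() runs on the next tick, one batch per tick:
const fs = require('fs');

const stream = fs.createWriteStream('output.txt');
const line = 'AA 11 999999999 20160101 ...';   // placeholder for the question's sample line
const TOTAL = 2000000;
const BATCH = 100000;
let i = 0;

function writeBatch() {
    stream.cork();
    const end = Math.min(i + BATCH, TOTAL);
    for (; i < end; i++) {
        stream.write(line + i + '\n');
    }
    process.nextTick(() => {
        stream.uncork();                 // flush this batch on the next tick
        if (i < TOTAL) writeBatch();     // then queue the next batch
        else stream.end();
    });
}

writeBatch();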

Fast file copy with progress information in Node.js?

Is there any chance to copy large files with Node.js with progress infos and fast?
Solution 1: fs.createReadStream().pipe(...) = useless, up to 5x slower than native cp
See: Fastest way to copy file in node.js, progress information is possible (with npm package 'progress-stream' ):
fs = require('fs');
fs.createReadStream('test.log').pipe(fs.createWriteStream('newLog.log'));
The only problem with that approach is that it easily takes 5 times longer compared to "cp source dest". See also the appendix below for the full test code.
Solution 2: rsync --info=progress2 = as slow as solution 1 = useless
Solution 3: My last resort, write a native module for node.js, using "CoreUtils" (the linux sources for cp and others) or other functions as shown in Fast file copy with progress
Does anyone know of anything better than solution 3? I'd like to avoid native code but it seems the best fit.
Thanks! Any package recommendations or hints (I tried all the fs* functions) are welcome!
Appendix:
test code, using pipe and progress:
var path = require('path');
var progress = require('progress-stream');
var fs = require('fs');
var _source = path.resolve('../inc/big.avi');// 1.5GB
var _target= '/tmp/a.avi';
var stat = fs.statSync(_source);
var str = progress({
    length: stat.size,
    time: 100
});

str.on('progress', function(progress) {
    console.log(progress.percentage);
});

function copyFile(source, target, cb) {
    var cbCalled = false;
    var rd = fs.createReadStream(source);
    rd.on("error", function(err) {
        done(err);
    });
    var wr = fs.createWriteStream(target);
    wr.on("error", function(err) {
        done(err);
    });
    wr.on("close", function(ex) {
        done();
    });
    rd.pipe(str).pipe(wr);

    function done(err) {
        if (!cbCalled) {
            console.log('done');
            cb && cb(err);
            cbCalled = true;
        }
    }
}

copyFile(_source, _target);
Update: a fast (with detailed progress!) C version is implemented here: https://github.com/MidnightCommander/mc/blob/master/src/filemanager/file.c#L1480. It seems like the best place to start from :-)
One aspect that may slow down the process is related to console.log. Take a look at this code:
const fs = require('fs');
const sourceFile = 'large.exe'
const destFile = 'large_copy.exe'
console.time('copying')
fs.stat(sourceFile, function(err, stat){
    const filesize = stat.size
    let bytesCopied = 0

    const readStream = fs.createReadStream(sourceFile)
    readStream.on('data', function(buffer){
        bytesCopied += buffer.length
        let porcentage = ((bytesCopied/filesize)*100).toFixed(2)
        console.log(porcentage + '%') // run once with this and later with this line commented
    })
    readStream.on('end', function(){
        console.timeEnd('copying')
    })
    readStream.pipe(fs.createWriteStream(destFile));
})
Here are the execution times for copying a 400 MB file:
with console.log: 692.950ms
without console.log: 382.540ms
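A hedged follow-up sketch: one way to keep the progress output while avoiding most of the console.log cost is to log only when the integer percentage changes (same hypothetical file names as above).
const fs = require('fs');

const sourceFile = 'large.exe';
const destFile = 'large_copy.exe';

fs.stat(sourceFile, function (err, stat) {
    const filesize = stat.size;
    let bytesCopied = 0;
    let lastLogged = -1;

    const readStream = fs.createReadStream(sourceFile);
    readStream.on('data', function (buffer) {
        bytesCopied += buffer.length;
        const pct = Math.floor((bytesCopied / filesize) * 100);
        if (pct !== lastLogged) {   // at most 101 log lines for the whole copy
            lastLogged = pct;
            console.log(pct + '%');
        }
    });
    readStream.pipe(fs.createWriteStream(destFile));
});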
cpy and cp-file both support progress reporting
I have the same issue. I want to copy large files as fast as possible and want progress information. I created a test utility that tests the different copy methods:
https://www.npmjs.com/package/copy-speed-test
You can run it simply with:
npx copy-speed-test --source someFile.zip --destination someNonExistentFolder
It does a native copy using child_process.exec(), a copy using fs.copyFile, and it uses createReadStream with a variety of different buffer sizes (you can change buffer sizes by passing them on the command line; run npx copy-speed-test -h for more info).
Some things I learnt:
fs.copyFile is just as fast as native
you can get quite inconsistent results on all these methods, particularly when copying from and to the same disc and with SSDs
if using a large buffer then createReadStream is nearly as good as the other methods
if you use a very large buffer then the progress is not very accurate.
The last point is because the progress is based on the read stream, not the write stream. If copying a 1.5 GB file and your buffer is 1 GB, then the progress immediately jumps to 66%, then jumps to 100%, and you then have to wait whilst the write stream finishes writing. I don't think that you can display the progress of the write stream.
If you have the same issue I would recommend that you run these tests with file sizes similar to what you will be dealing with and across similar media. My end use case is copying a file from an SD card plugged into a Raspberry Pi across a network to a NAS, so that's the scenario I ran the tests for.
I hope someone other than me finds it useful!
I solved a similar problem (using Node v8 or v10) by changing the buffer size. I think the default buffer size is around 16 KB, which fills and empties quickly but requires a full cycle around the event loop for each operation. I changed the buffer to 1 MB and writing a 2 GB image fell from taking around 30 minutes to 5, which sounds similar to what you are seeing. My image was also decompressed on the fly, which possibly exacerbated the problem. Documentation on stream buffering has been in the manual since at least Node v6: https://nodejs.org/api/stream.html#stream_buffering
Here are the key code components you can use:
let gzSize = 1; // do not initialize divisors to 0
const hwm = { highWaterMark: 1024 * 1024 };
const inStream = fs.createReadStream( filepath, hwm );

// Capture the filesize for showing percentages
inStream.on( 'open', function fileOpen( fdin ) {
    inStream.pause(); // wait for fstat before starting
    fs.fstat( fdin, function( err, stats ) {
        gzSize = stats.size;
        // openTargetDevice does a complicated fopen() for the output.
        // This could simply be inStream.resume()
        openTargetDevice( gzSize, targetDeviceOpened );
    });
});

inStream.on( 'data', function shaData( data ) {
    const bytesRead = data.length;
    offset += bytesRead;
    console.log( `Read ${offset} of ${gzSize} bytes, ${Math.floor( offset * 100 / gzSize )}% ...` );
    // Write to the output file, etc.
});

// Once the target is open, I convert the fd to a stream and resume the input.
// For the purpose of example, note only that the output has the same buffer size.
function targetDeviceOpened( error, fd, device ) {
    if( error ) return exitOnError( error );
    const writeOpts = Object.assign( { fd }, hwm );
    outStream = fs.createWriteStream( undefined, writeOpts );
    outStream.on( 'open', function fileOpen( fdin ) {
        // In a simpler structure, this is in the fstat() callback.
        inStream.resume(); // we have the _input_ size, resume read
    });
    // [...]
}
I have not made any attempt to optimize these further; the result is similar to what I get on the commandline using 'dd' which is my benchmark.
I left in converting a file descriptor to a stream and using the pause/resume logic so you can see how these might be useful in more complicated situations than the simple fs.statSync() in your original post. Otherwise, this is simply adding the highWaterMark option to Tulio's answer.
Here is what I'm using now; it copies one file with progress:
String.prototype.toHHMMSS = function () {
    var sec_num = parseInt(this, 10); // don't forget the second param
    var hours = Math.floor(sec_num / 3600);
    var minutes = Math.floor((sec_num - (hours * 3600)) / 60);
    var seconds = sec_num - (hours * 3600) - (minutes * 60);
    if (hours < 10) { hours = "0" + hours; }
    if (minutes < 10) { minutes = "0" + minutes; }
    if (seconds < 10) { seconds = "0" + seconds; }
    return hours + ':' + minutes + ':' + seconds;
}
var purefile="20200811140938_0002.MP4";
var filename="/sourceDir"+purefile;
var output="/destinationDir"+purefile;
var progress = require('progress-stream');
var fs = require('fs');
const convertBytes = function(bytes) {
    const sizes = ["Bytes", "KB", "MB", "GB", "TB"]
    if (bytes == 0) {
        return "n/a"
    }
    const i = parseInt(Math.floor(Math.log(bytes) / Math.log(1024)))
    if (i == 0) {
        return bytes + " " + sizes[i]
    }
    return (bytes / Math.pow(1024, i)).toFixed(1) + " " + sizes[i]
}

var copiedFileSize = fs.statSync(filename).size;

var str = progress({
    length: copiedFileSize, // length(integer) - If you already know the length of the stream, then you can set it. Defaults to 0.
    time: 200,              // time(integer) - Sets how often progress events are emitted in ms. If omitted then the default is to do so every time a chunk is received.
    speed: 1,               // speed(integer) - Sets how long the speedometer needs to calculate the speed. Defaults to 5 sec.
    // drain: true          // drain(boolean) - In case you don't want to include a readstream after progress-stream, set to true to drain automatically. Defaults to false.
    // transferred: false   // transferred(integer) - If you want to set the size of previously downloaded data. Useful for a resumed download.
});

/*
{
    percentage: 9.05,
    transferred: 949624,
    length: 10485760,
    remaining: 9536136,
    eta: 42,
    runtime: 3,
    delta: 295396,
    speed: 949624
}
*/
str.on('progress', function(progress) {
    console.log(progress.percentage + '%');
    console.log('elapsed: ' + progress.runtime.toString().toHHMMSS() + 's / remaining: ' + progress.eta.toString().toHHMMSS() + 's');
    console.log(convertBytes(progress.speed) + "/s" + ' ' + progress.speed);
});
// const hwm = { highWaterMark: 1024 * 1024 };
var hrstart = process.hrtime(); // measure the copy time
var rs = fs.createReadStream(filename)
    .pipe(str)
    .pipe(fs.createWriteStream(output, { emitClose: true }).on("close", () => {
        var hrend = process.hrtime(hrstart);
        var timeInMs = (hrend[0] * 1000000000 + hrend[1]) / 1000000000;
        var finalSpeed = convertBytes(copiedFileSize / timeInMs);
        console.log('Done: file copy: ' + finalSpeed + "/s");
        console.info('Execution time (hr): %ds %dms', hrend[0], hrend[1] / 1000000);
    }));
Refer to https://www.npmjs.com/package/fsprogress.
With that package, you can track progress while you are copying or moving files. The progress tracking is event and method-call based, so it's very convenient to use.
You can provide options to do a lot of things, e.g. the total number of files for concurrent operations, or the chunk size to read from a file at a time.
It was tested with a single file up to 17 GB and with directories up to a size I don't really remember, but it was pretty large. And also :D, it is safe to use for large file(s).
So go ahead and have a look at it to see whether it matches your expectations or is what you are looking for :D
