Collect changes in a file every fixed time in node js - node.js

I have an external program that is streaming data to a csv file every now and then (but quit a lot).
I want to collect every 10 seconds all the changed data and do some processing on it.
means I want to process only lines I didn't processed before.
this is the basic code:
function myFunction() {
var loop = setInterval(
() =>
{
var instream = fs.createReadStream("rawData.csv"); //should somehow include only new data since last cycle
var outstream = fs.createWriteStream("afterProcessing.csv");
someProcessing(instream, outstream);
outstream.on('finish', () => {
sendBackResults("afterProcessing.csv");
});
//will exit the loop when 'run' flag will change to false
if(!run) ? clearInterval(loop) : console.log(`\nStill Running...\n`) ;
} , 10000 );
}
Now, I tried to work with chokidar and fs.watch but I couldn't figure out how to use them in this case.

fs.createReadStream can take a start parameter
options can include start and end values to read a range of bytes from
the file instead of the entire file. Both start and end are inclusive
and start counting at 0
So you need to save the last position read, and use it on start.
You can get that using: instream.bytesRead.
let bytesRead = 0;
instream.on('end', () => {
bytesRead = instream.bytesRead;
});

Related

How to write incrementally to a text file and flush output

My Node.js program - which is an ordinary command line program that by and large doesn't do anything operationally unusual, nothing system-specific or asynchronous or anything like that - needs to write messages to a file from time to time, and then it will be interrupted with ^C and it needs the contents of the file to still be there.
I've tried using fs.createWriteStream but that just ends up with a 0-byte file. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
I've tried using winston but that ends up not creating the file at all. (The file does contain text if the program ends by running off the end of the main file, but that's not the scenario I have.)
And fs.writeFile works perfectly when you have all the text you want to write up front, but doesn't seem to support appending a line at a time.
What is the recommended way to do this?
Edit: specific code I've tried:
var fs = require('fs')
var log = fs.createWriteStream('test.log')
for (var i = 0; i < 1000000; i++) {
console.log(i)
log.write(i + '\n')
}
Run for a few seconds, hit ^C, leaves a 0-byte file.
Turns out Node provides a lower level file I/O API that seems to work fine!
var fs = require('fs')
var log = fs.openSync('test.log', 'w')
for (var i = 0; i < 100000; i++) {
console.log(i)
fs.writeSync(log, i + '\n')
}
NodeJS doesn't work in the traditional way. It uses a single thread, so by running a large loop and doing I/O inside, you aren't giving it a chance (i.e. releasing the CPU) to do other async operations for eg: flushing memory buffer to actual file.
The logic must be - do one write, then pass your function (which invokes the write) as a callback to process.nextTick or as callback to the write stream's drain event (if buffer was full during last write).
Here's a quick and dirty version which does what you need. Notice that there are no long-running loops or other CPU blockage, instead I schedule my subsequent writes for future and return quickly, momentarily freeing up the CPU for other things.
var fs = require('fs')
var log = fs.createWriteStream('test.log');
var i = 0;
function my_write() {
if (i++ < 1000000)
{
var res = log.write("" + i + "\r\n");
if (!res) {
log.on('drain',my_write);
} else {
process.nextTick(my_write);
}
console.log("Done" + i + " " + res + "\r\n");
}
}
my_write();
This function might also be helpful.
/**
* Write `data` to a `stream`. if the buffer is full will block
* until it's flushed and ready to be written again.
* [see](https://nodejs.org/api/stream.html#stream_writable_write_chunk_encoding_callback)
*/
export function write(data, stream) {
return new Promise((resolve, reject) => {
if (stream.write(data)) {
process.nextTick(resolve);
} else {
stream.once("drain", () => {
stream.off("error", reject);
resolve();
});
stream.once("error", reject);
}
});
}
You are writing into file using for loop which is bad but that's other case. First of all createWriteStream doesn't close the file automatically you should call close.
If you call close immediately after for loop it will close without writing because it's async.
For more info read here: https://nodejs.org/api/fs.html#fs_fs_createwritestream_path_options
Problem is async function inside for loop.

NODEJS: Uncork() method on writable stream doesn't really flush the data

I am writing quite simple application to transform data - read one file and write to another. Files are relatively large - 2 gb. However, what I found is that flush to the file system is not happening, on cork-uncork cycle, it only happens on end(), so the end() basically hangs the system until it's fully flashed.
I simplified the example so it just writes a line to the stream a lot of times.
var PREFIX = 'E:\\TEST\\';
var line = 'AA 11 999999999 20160101 123456 20160101 AAA 00 00 00 0 0 0 2 2 0 0 20160101 0 00';
var fileSystem = require('fs');
function writeStrings() {
var stringsCount = 0;
var stream = fileSystem.createWriteStream(PREFIX +'output.txt');
stream.once('drain', function () {
console.log("drained");
});
stream.once('open', function (fileDescriptor) {
var started = false;
console.log('writing file ');
stream.cork();
for (i = 0; i < 2000000; i++) {
stream.write(line + i);
if (i % 10000 == 0) {
// console.log('passed ',i);
}
if (i % 100000 == 0) {
console.log('uncorcked ',i,stream._writableState.writing);
stream.uncork();
stream.cork();
}
}
stream.end();
});
stream.once('finish', function () {
console.log("done");
});
}
writeStrings();
going inside the node _stream_writable.js, I found that it flushes the buffer only on this condition:
if (!state.writing &&
!state.corked &&
!state.finished &&
!state.bufferProcessing &&
state.buffer.length)
clearBuffer(this, state);
and, as you can see from example, the writing flag doesn't set back after first uncork(), which prevents the uncork to flush.
Also, I don't see drain events evoking at all. Playing with highWaterMark doesn't help (actually doesn't seems to have effect on anything). Manually setting the writing to false (+ some other flags) indeed helped but this is surely wrong.
Am I am misunderstanding the concept of this?
From the node.js documentation I found that number of uncork() should match the number of cork() call, I am not seeing matching stream.uncork() call for stream.cork(), which is called before the for loop. That might be the issue.
Looking at a guide on nodejs.org, you aren't supposed to call stream.uncork() twice in the same event loop. Here is an excerpt:
// Using .uncork() twice here makes two calls on the C++ layer, rendering the
// cork/uncork technique useless.
ws.cork();
ws.write('hello ');
ws.write('world ');
ws.uncork();
ws.cork();
ws.write('from ');
ws.write('Matteo');
ws.uncork();
// The correct way to write this is to utilize process.nextTick(), which fires
// on the next event loop.
ws.cork();
ws.write('hello ');
ws.write('world ');
process.nextTick(doUncork, ws);
ws.cork();
ws.write('from ');
ws.write('Matteo');
process.nextTick(doUncork, ws);
// as a global function
function doUncork(stream) {
stream.uncork();
}
.cork() can be called as many times we want, we just need to be careful to call .uncork() the same amount of times to make it flow again.

Fast file copy with progress information in Node.js?

Is there any chance to copy large files with Node.js with progress infos and fast?
Solution 1 : fs.createReadStream().pipe(...) = useless, up to 5 slower than native cp
See: Fastest way to copy file in node.js, progress information is possible (with npm package 'progress-stream' ):
fs = require('fs');
fs.createReadStream('test.log').pipe(fs.createWriteStream('newLog.log'));
The only problem with that way is that it takes easily 5 times longer compared "cp source dest". See also the appendix below for the full test code.
Solution 2 : rsync ---info=progress2 = same slow as solution 1 = useless
Solution 3 : My last resort, write a native module for node.js, using "CoreUtils" (linux sources for cp and others) or other functions as shown in Fast file copy with progress
Does anyone knows better than solution 3? I'd like to avoid native code but it seems the best fit.
thanks! any package recommendations or hints (tried all fs**) are welcome!
Appendix:
test code, using pipe and progress:
var path = require('path');
var progress = require('progress-stream');
var fs = require('fs');
var _source = path.resolve('../inc/big.avi');// 1.5GB
var _target= '/tmp/a.avi';
var stat = fs.statSync(_source);
var str = progress({
length: stat.size,
time: 100
});
str.on('progress', function(progress) {
console.log(progress.percentage);
});
function copyFile(source, target, cb) {
var cbCalled = false;
var rd = fs.createReadStream(source);
rd.on("error", function(err) {
done(err);
});
var wr = fs.createWriteStream(target);
wr.on("error", function(err) {
done(err);
});
wr.on("close", function(ex) {
done();
});
rd.pipe(str).pipe(wr);
function done(err) {
if (!cbCalled) {
console.log('done');
cb && cb(err);
cbCalled = true;
}
}
}
copyFile(_source,_target);
update: a fast (with detailed progress!) C version is implemented here: https://github.com/MidnightCommander/mc/blob/master/src/filemanager/file.c#L1480. Seems the best place to go from :-)
One aspect that may slow down the process is related to console.log. Take a look into this code:
const fs = require('fs');
const sourceFile = 'large.exe'
const destFile = 'large_copy.exe'
console.time('copying')
fs.stat(sourceFile, function(err, stat){
const filesize = stat.size
let bytesCopied = 0
const readStream = fs.createReadStream(sourceFile)
readStream.on('data', function(buffer){
bytesCopied+= buffer.length
let porcentage = ((bytesCopied/filesize)*100).toFixed(2)
console.log(porcentage+'%') // run once with this and later with this line commented
})
readStream.on('end', function(){
console.timeEnd('copying')
})
readStream.pipe(fs.createWriteStream(destFile));
})
Here are the execution times copying a 400mb file:
with console.log: 692.950ms
without console.log: 382.540ms
cpy and cp-file both support progress reporting
I have the same issue. I want to copy large files as fast as possible and want progress information. I created a test utility that tests the different copy methods:
https://www.npmjs.com/package/copy-speed-test
You can run it simply with:
npx copy-speed-test --source someFile.zip --destination someNonExistentFolder
It does a native copy using child_process.exec(), a copy file using fs.copyFile and it uses createReadStream with a variety of different buffer sizes (you can change buffer sizes by passing them on the command line. run npx copy-speed-test -h for more info.
Some things I learnt:
fs.copyFile is just as fast as native
you can get quite inconsistent results on all these methods, particularly when copying from and to the same disc and with SSDs
if using a large buffer then createReadStream is nearly as good as the other methods
if you use a very large buffer then the progress is not very accurate.
The last point is because the progress is based on the read stream, not the write stream. if copying a 1.5GB file and your buffer is 1GB then the progress immediately jumps to 66% then jumps to 100% and you then have to wait whilst the write stream finishes writing. I don't think that you can display the progress of the write stream.
If you have the same issue I would recommend that you run these tests with similar file sizes to what you will be dealing with and across similar media. My end use case is copying a file from an SD card plugged into a raspberry pi and copied across a network to a NAS so that's what I was the scenario that I ran the tests for.
I hope someone other than me finds it useful!
I solved a similar problem (using Node v8 or v10) by changing the buffer size. I think the default buffer size is around 16kb, which fills and empties quickly but requires a full cycle around the event loop for each operation. I changed the buffer to 1MB and writing a 2GB image fell from taking around 30 minutes to 5, which sounds similar to what you are seeing. My image was also decompressed on the fly, which possibly exacerbated the problem. Documentation on stream buffering has been in the manual since at least Node v6: https://nodejs.org/api/stream.html#stream_buffering
Here are the key code components you can use:
let gzSize = 1; // do not initialize divisors to 0
const hwm = { highWaterMark: 1024 * 1024 }
const inStream = fs.createReadStream( filepath, hwm );
// Capture the filesize for showing percentages
inStream.on( 'open', function fileOpen( fdin ) {
inStream.pause(); // wait for fstat before starting
fs.fstat( fdin, function( err, stats ) {
gzSize = stats.size;
// openTargetDevice does a complicated fopen() for the output.
// This could simply be inStream.resume()
openTargetDevice( gzSize, targetDeviceOpened );
});
});
inStream.on( 'data', function shaData( data ) {
const bytesRead = data.length;
offset += bytesRead;
console.log( `Read ${offset} of ${gzSize} bytes, ${Math.floor( offset * 100 / gzSize )}% ...` );
// Write to the output file, etc.
});
// Once the target is open, I convert the fd to a stream and resume the input.
// For the purpose of example, note only that the output has the same buffer size.
function targetDeviceOpened( error, fd, device ) {
if( error ) return exitOnError( error );
const writeOpts = Object.assign( { fd }, hwm );
outStream = fs.createWriteStream( undefined, writeOpts );
outStream.on( 'open', function fileOpen( fdin ) {
// In a simpler structure, this is in the fstat() callback.
inStream.resume(); // we have the _input_ size, resume read
});
// [...]
}
I have not made any attempt to optimize these further; the result is similar to what I get on the commandline using 'dd' which is my benchmark.
I left in converting a file descriptor to a stream and using the pause/resume logic so you can see how these might be useful in more complicated situations than the simple fs.statSync() in your original post. Otherwise, this is simply adding the highWaterMark option to Tulio's answer.
Here is what I'm trying to use now, it copies 1 file with progress:
String.prototype.toHHMMSS = function () {
var sec_num = parseInt(this, 10); // don't forget the second param
var hours = Math.floor(sec_num / 3600);
var minutes = Math.floor((sec_num - (hours * 3600)) / 60);
var seconds = sec_num - (hours * 3600) - (minutes * 60);
if (hours < 10) {hours = "0"+hours;}
if (minutes < 10) {minutes = "0"+minutes;}
if (seconds < 10) {seconds = "0"+seconds;}
return hours+':'+minutes+':'+seconds;
}
var purefile="20200811140938_0002.MP4";
var filename="/sourceDir"+purefile;
var output="/destinationDir"+purefile;
var progress = require('progress-stream');
var fs = require('fs');
const convertBytes = function(bytes) {
const sizes = ["Bytes", "KB", "MB", "GB", "TB"]
if (bytes == 0) {
return "n/a"
}
const i = parseInt(Math.floor(Math.log(bytes) / Math.log(1024)))
if (i == 0) {
return bytes + " " + sizes[i]
}
return (bytes / Math.pow(1024, i)).toFixed(1) + " " + sizes[i]
}
var copiedFileSize = fs.statSync(filename).size;
var str = progress({
length: copiedFileSize, // length(integer) - If you already know the length of the stream, then you can set it. Defaults to 0.
time: 200, // time(integer) - Sets how often progress events are emitted in ms. If omitted then the default is to do so every time a chunk is received.
speed: 1, // speed(integer) - Sets how long the speedometer needs to calculate the speed. Defaults to 5 sec.
// drain: true // drain(boolean) - In case you don't want to include a readstream after progress-stream, set to true to drain automatically. Defaults to false.
// transferred: false// transferred(integer) - If you want to set the size of previously downloaded data. Useful for a resumed download.
});
/*
{
percentage: 9.05,
transferred: 949624,
length: 10485760,
remaining: 9536136,
eta: 42,
runtime: 3,
delta: 295396,
speed: 949624
}
*/
str.on('progress', function(progress) {
console.log(progress.percentage+'%');
console.log('eltelt: '+progress.runtime.toString().toHHMMSS() + 's / hátra: ' + progress.eta.toString().toHHMMSS()+'s');
console.log(convertBytes(progress.speed)+"/s"+' '+progress.speed);
});
//const hwm = { highWaterMark: 1024 * 1024 } ;
var hrstart = process.hrtime(); // measure the copy time
var rs=fs.createReadStream(filename)
.pipe(str)
.pipe(fs.createWriteStream(output, {emitClose: true}).on("close", () => {
var hrend = process.hrtime(hrstart);
var timeInMs = (hrend[0]* 1000000000 + hrend[1]) / 1000000000;
var finalSpeed=convertBytes(copiedFileSize/timeInMs);
console.log('Done: file copy: '+ finalSpeed+"/s");
console.info('Execution time (hr): %ds %dms', hrend[0], hrend[1] / 1000000);
}) );
Refer to https://www.npmjs.com/package/fsprogress.
With that package, you can track progress while you are copying or moving files. The progress tracking is event and method call based so its very convenient to use.
You can provide options to do a lot of things. eg. total number of file for concurrent operation, chunk size to read from a file at a time.
It was tested for single file upto 17GB and directories up to i dont really remember but it was pretty large. And also :D, it is safe to use for large file(s).
So, go ahead and have a look at it whether it matches your expectations or if it is what you are looking for :D

Reading file in segments of X number of lines

I have a file with a lot of entries (10+ million), each representing a partial document that is being saved to a mongo database (based on some criteria, non-trivial).
To avoid overloading the database (which is doing other operations at the same time), I wish to read in chunks of X lines, wait for them to finish, read the next X lines, etc.
Is there any way to use any of the fscallback-mechanisms to also "halt" progress at a certain point, without blocking the entire program? From what I can tell they will all run from start to finish with no way of stopping it, unless you stop reading the file entirely.
The issues is that because of the file size, memory also becomes an issue and because of the time the updates take, a LOT of the data will be held in memory exceeding the 1 GB limit and causing the program to crash. Secondarily, as I said, I don't want to queue 1 million updates and completely stress the mongo database.
Any and all suggestions welcome.
UPDATE: Final solution using line-reader (available via npm) below, in pseudo-code.
var lineReader = require('line-reader');
var filename = <wherever you get it from>;
lineReader(filename, function(line, last, cb) {
//
// Do work here, line contains the line data
// last is true if it's the last line in the file
//
function checkProcessed(callback) {
if (doneProcessing()) { // Implement doneProcessing to check whether whatever you are doing is done
callback();
}
else {
setTimeout(function() { checkProcessed(callback) }, 100); // Adjust timeout according to expecting time to process one line
}
}
checkProcessed(cb);
});
This is implemented to make sure doneProcessing() returns true before attempting to work on more lines - this means you can effectively throttle whatever you are doing.
I don't use MongoDB and I'm not an expert in using Lazy, but I think something like below might work or give you some ideas. (note that I have not tested this code)
var fs = require('fs'),
lazy = require('lazy');
var readStream = fs.createReadStream('yourfile.txt');
var file = lazy(readStream)
.lines // ask to read stream line by line
.take(100) // and read 100 lines at a time.
.join(function(onehundredlines){
readStream.pause(); // pause reading the stream
writeToMongoDB(onehundredLines, function(err){
// error checking goes here
// resume the stream 1 second after MongoDB finishes saving.
setTimeout(readStream.resume, 1000);
});
});
}

Stdout flush for NodeJS?

Is there any stdout flush for nodejs just like python or other languages?
sys.stdout.write('some data')
sys.stdout.flush()
Right now I only saw process.stdout.write() for nodejs.
process.stdout is a WritableStream object, and the method WritableStream.write() automatically flushes the stream (unless it was explicitly corked). However, it will return true if the flush was successful, and false if the kernel buffer was full and it can't write yet. If you need to write several times in succession, you should handle the drain event.
See the documentation for write.
In newer NodeJS versions, you can pass a callback to .write(), which will be called once the data is flushed:
sys.stdout.write('some data', () => {
console.log('The data has been flushed');
});
This is exactly the same as checking .write() result and registering to the drain event:
let write = sys.stdout.write('some data');
if (!write) {
sys.stdout.once('drain', () => {
console.log('The data has been flushed');
});
}
write returns true if the data has been flushed. If it returns false, you can wait for the 'drain' event.
I think there is no flush, because that would be a blocking operation.
There is another function stdout which to clear last output to the terminal which is kind of work like flush
function flush() {
process.stdout.clearLine();
process.stdout.cursorTo(0);
}
var total = 5000;
var current = 0;
var percent = 0;
var waitingTime = 500;
setInterval(function() {
current += waitingTime;
percent = Math.floor((current / total) * 100);
flush();
process.stdout.write(`downloading ... ${percent}%`);
if (current >= total) {
console.log("\nDone.");
clearInterval(this);
}
}, waitingTime);
cursorTo will move the cursor to position 0 which is the starting point
use the flush function before stdout.write because it will clear the screen, if you put after you will not see any output

Resources