I am trying to read a text file which is continuously growing (adding new lines at end) with very high rate, lets say ~100 lines per seconds where line size is approximate 200 characters.
I tried the following, which works, but it lags by a minute or so.
var fs = require('fs');
var path = "D:\\testreadwrite.txt";

// Re-read and print the whole file whenever it changes.
// NOTE(review): fs.watchFile polls on an interval (default ~5s), which is
// part of the observed lag; fs.watch is event-driven and reacts faster.
fs.watchFile(path, function() {
    console.log('File Changed ...');
    // Read asynchronously with an explicit encoding so we get a string
    // (not a Buffer) and don't block the event loop on a growing file.
    // The original also assigned to an undeclared `file`, creating an
    // implicit global.
    fs.readFile(path, 'utf8', function(err, file) {
        if (err) {
            console.error(err);
            return;
        }
        console.log('File content at : ' + new Date() + ' is \n' + file);
    });
});
I really do not need synchronous reading, but lagging by more than a minute is too much, and I do not need the entire file each time. All I need is to read the data and process it line by line, for each new line that arrives. So I tried the code below, where I am planning to loop through and pass an offset for each iteration. But this code is not working for some unknown reason. Please help.
var fs = require('fs');
var path = "D:\\Work\\Jai Ho\\myapp\\public\\testreadwrite.txt";

// Byte offset of the data already consumed, so each change event reads only
// the newly appended portion instead of the whole file.
var lastSize = 0;

fs.watch(path, function(event, filename) {
    if(filename){
        fs.stat(path, function(error, stats) {
            if (error) { console.error(error); return; }
            // File shrank (truncated/recreated): start over from the top.
            if (stats.size < lastSize) lastSize = 0;
            var length = stats.size - lastSize;
            if (length === 0) return; // nothing new to read
            fs.open(path, "r", function(error, fd) {
                if (error) { console.error(error); return; }
                // Buffer.alloc replaces the deprecated, uninitialized
                // new Buffer(size).
                var buffer = Buffer.alloc(length);
                // Read starting at the old end-of-file, not position 0.
                fs.read(fd, buffer, 0, buffer.length, lastSize, function(error, bytesRead, buffer) {
                    // Always release the descriptor; the original leaked
                    // one fd per change event.
                    fs.close(fd, function() {});
                    if (error) { console.error(error); return; }
                    lastSize += bytesRead;
                    var data = buffer.toString("utf8", 0, bytesRead);
                    console.log(data);
                });
            });
        });
    }
    else{
        console.log('filename not provided')
    }
});
Related
I have about 1000 CSV files that need parsing. Each one contains about 1000 rows, for 1 million total records. The data need to be transformed and then saved to the db, which is why I have to do this through my app.
My problem is that the parser gradually slows down as it loops through the files, to the point where it will take forever to complete the run.
Here's how it's currently set up.
var files = [ file1Path, file2Path.... file1000Path ];
function parseFile(index) {
var startTime = new Date().getTime();
var filePath = files[index];
var stream = fs.createReadStream(filePath);
//parse using fast-csv npm module
csv.fromStream(stream, { config })
.on('data', function (row) {
transformAndSave(row);
})
.on('end', function () {
console.log( new Date().getTime() - startTime + " elapsed " );
parseFile(index + 1)
});
}
parseFile(0);
I've tried this a few different ways and it's basically the same thing every time. The first file completes in 2 seconds, by the 8th file we're at 5 or 6 seconds, later on it climbs to 24 seconds, etc. Other things I've tried include doing... files.forEach(function (file) { //run the parser }), doing batches of 100 at a time or even 5 at a time, and it makes no difference: it progressively slows down from a rate of 500 per second to 1 or 2 per second.
Does anybody have ideas for how I can prevent this slow down? Part of the reason could be that stream.on('end') completes before transformAndSave is finished, potentially creating a backlog. But at this point I'm out of ideas and would appreciate any help anyone could offer.
Thanks so much in advance!!
Daniel
note for Meteor people. I'm calling this function as a Meteor method. Not sure if that makes any difference, but in case it does, now you know.
Update
Here's is the log output demonstrating the steady rise in memory usage and processing time.
Seems like a resource problem, as in you're running out of memory. I would try an approach that doesn't use a recursive function which might allow resources to be released more readily. One approach could be to use async.
var Logger = require('arsenic-logger');
var fs = require('fs');
var async = require('async');
var csv = require('fast-csv');
var path = require('path');

Logger.echoMemoryUsage();

var testDir = path.resolve(__dirname, 'test');

fs.readdir(testDir, (err, files) => {
    Logger.debug(files);
    if (err) {
        Logger.error(err);
        return; // don't fall through and iterate an undefined file list
    }
    // Parse at most two files concurrently so memory is released steadily
    // instead of building one giant recursive backlog.
    async.mapLimit(files, 2, function(file, cb) {
        var startTime = new Date().getTime();
        var stream = fs.createReadStream(testDir + '/' + file);
        var done = false; // guard: 'error' and 'end' must not both call cb
        Logger.debug("Reading: " + file);
        var config = {}; // was an implicit global in the original
        // parse using fast-csv npm module
        csv.fromStream(stream, config)
            .on('data', function(row) {
                //Logger.debug(row);
                //transformAndSave(row);
            })
            .on('error', function(err) {
                Logger.error(err);
                if (!done) { done = true; cb(err); }
            })
            .on('end', function() {
                Logger.debug(new Date().getTime() - startTime + " elapsed ");
                // Small delay gives the GC a chance between files.
                if (!done) { done = true; setTimeout(cb, 1000); }
            });
    }, function(err, results) {
        Logger.info("Finished!");
        process.exit(1);
    });
});
Is there a way to read one symbol at a time in nodejs from file without storing the whole file in memory?
I found an answer for lines
I tried something like this but it doesn't help:
// 'bufferSize' is not a valid fs.createReadStream option, so it was silently
// ignored — that is why the attempt "doesn't help". The chunk-size knob is
// 'highWaterMark'. Even so, 'data' chunks are not guaranteed to be exactly
// one character; use readable.read(1) for a hard guarantee.
const stream = fs.createReadStream("walmart.dump", {
    encoding: 'utf8',
    fd: null,
    highWaterMark: 1,
});
stream.on('data', function(sym){
    console.log(sym);
});
Readable stream has a read() method, where you can pass the length, in bytes, of every chunk to be read. For example:
var readable = fs.createReadStream("walmart.dump", {
    encoding: 'utf8',
    fd: null,
});

// Drain the internal buffer one character at a time each time the stream
// signals that data is available.
readable.on('readable', function() {
    for (;;) {
        var chunk = readable.read(1); /* here */
        if (chunk === null) {
            break; // buffer exhausted for now
        }
        console.log(chunk); // chunk is one byte
    }
});
Here's a lower-level way to do it: fs.read(fd, buffer, offset, length, position, callback)
using:
const fs = require('fs');

// open file for reading, returns file descriptor
const fd = fs.openSync('your-file.txt','r');

// Read the single byte at `position` from the already-open descriptor and
// pass it to cb(err, buffer).
function readOneCharFromFile(position, cb){
    // only need to store one byte (one character)
    // Buffer.alloc replaces the deprecated, uninitialized new Buffer(1).
    const b = Buffer.alloc(1);
    fs.read(fd, b, 0, 1, position, function(err, bytesRead, buffer){
        // Surface read errors instead of logging garbage first.
        if (err) {
            return cb(err);
        }
        console.log('data => ', String(buffer));
        cb(null, buffer);
    });
}
you will have to increment the position, as you read the file, but it will work.
here's a quick example of how to read a whole file, character by character
Just for fun I wrote this complete script to do it, just pass in a different file path, and it should work
const async = require('async');
const fs = require('fs');
const path = require('path');

// Accumulate the file's bytes one at a time, starting at `position`, until
// fs.read reports zero bytes read (EOF); then call cb(err, allBytes).
function read(fd, position, cb) {
    let isByteRead = null;      // null = not attempted yet, false = hit EOF
    let ret = Buffer.alloc(0);  // Buffer.alloc replaces deprecated new Buffer(0)
    async.whilst(
        function () {
            return isByteRead !== false;
        },
        function (cb) {
            readOneCharFromFile(fd, position++, function (err, bytesRead, buffer) {
                if(err){
                    return cb(err);
                }
                isByteRead = !!bytesRead;
                if(isByteRead){
                    ret = Buffer.concat([ret,buffer]);
                }
                cb(null);
            });
        },
        function (err) {
            cb(err, ret);
        }
    );
}

// Read exactly one byte at `position`; cb receives (err, bytesRead, buffer).
function readOneCharFromFile(fd, position, cb) {
    // only need to store one byte (one character)
    // Buffer.alloc replaces the deprecated, uninitialized new Buffer(1).
    const b = Buffer.alloc(1);
    fs.read(fd, b, 0, 1, position, cb);
}

/// use your own file here
const file = path.resolve(__dirname + '/fixtures/abc.txt');
const fd = fs.openSync(file, 'r');

// start reading at position 0, position will be incremented
read(fd, 0, function (err, data) {
    if (err) {
        console.error(err.stack || err);
    }
    else {
        console.log('data => ', String(data));
    }
    fs.closeSync(fd);
});
As you can see we increment the position integer every time we read the file. Hopefully the OS keeps the file in memory as we go. Using async.whilst() is OK, but I think for a more functional style it's better not to keep the state in the top of the function (ret and isByteRead). I will leave it as an exercise to the reader to implement this without using those stateful variables.
I have to read a very large csv file (> 80MB and growing).
I usually only have to parse the last 1% of the file. But getting to that part takes a few minutes.
Is there a way that I only start reading on line N?
Or alternatively could I read the stream from end to start?
I'm currently using fast-csv to read the file:
// convert csv into postgres copy file
csv.fromPath(filepath, {
    headers: false
}).transform(function(data) {
    // check if record meets condition
    // Always pass a radix to parseInt; the first column is presumably a unix
    // timestamp in seconds — TODO confirm against the data source.
    var dt = parseInt(data[0], 10);
    var date = new Date(dt * 1000);
    var mom = moment(date);
    if (mom.isAfter('2014-01-01 00:00')) {
        // transform data and return object
        return transform(data);
    }
    // returning null drops the record from the output stream
    return null;
}).pipe(csv.createWriteStream({
    headers: true
})).pipe(fs.createWriteStream(outpath, {
    encoding: "utf8"
})).on('finish', function() {
    // do postgres import
});
Using a combination of node's fs.stat, fs.open, fs.read, you could find the size of the file and just read the last 1% into a buffer:
var fs = require('fs');
var filename = 'csv.csv';

// Read just the final 1% of the file into memory.
fs.stat(filename, function(err, stat) {
    if(err) throw err;
    var bytesToRead = Math.ceil(0.01 * stat.size); // last 1%
    var startingPosition = stat.size - bytesToRead;
    if (bytesToRead === 0) return; // empty file: nothing to read
    // Buffer.alloc replaces the deprecated, uninitialized new Buffer(n).
    var readBuffer = Buffer.alloc(bytesToRead);
    fs.open(filename, 'r', function(err, fd){
        if(err) throw err;
        fs.read(fd, readBuffer, 0, bytesToRead, startingPosition,
            function(err, bytesRead){
                // Release the descriptor either way; the original leaked it.
                fs.close(fd, function() {});
                if(err) throw err;
                console.log(readBuffer.toString());
            });
    });
});
You couldn't start reading from line N because you would have to read it all to know where the newline characters are.
I'd like to watch a CSV file and get the newest records since it was changed. I'm running the following shell command to build a very simple csv file and append a new line every second:
rm test.csv & x=0 && while true; do echo "${x},${x},${x}" >> test.csv; x=$(($x+1)); sleep 1; done
The following code prints all the records of the file until the first change and then just emits the dashed line, as if it's not re-reading the file:
'use strict';

var fs = require('fs'),
    dataFile = __dirname + '/server/data/test.csv',
    csv = require('csv');

var watcher = fs.watch(dataFile);
watcher.on('change', fileChange);

function fileChange(e, fn){
    if (e) console.error(e)
    // A csv parser is a one-shot writable stream: the first pipe calls end()
    // on it, so reusing a single shared parser printed nothing on later
    // change events. Build a fresh parser for every change instead.
    var parser = csv.parse({delimiter: ','}, function(err, data){
        console.log(data);
    });
    fs.createReadStream(dataFile).pipe(parser);
    console.log('-------')
}
Shouldn't the fileChange function re-read the file on every change? My ultimate plan here is to get both the previous array of lines and the current one and use lodash's difference function to return only the differences. If there's better way, I'm open to hear it though.
My guess is that fs.createReadStream() has opened the file and it's not being closed. So on the second event fs.createReadStream() fails. No bueno.
Try using fs.readFile() instead like this:
// Re-read the whole watched file on every change event.
function fileChange(e, fn){
    if (e) console.error(e)
    // Pass an encoding so `data` arrives as a string; without it fs.readFile
    // yields a raw Buffer and console.log prints "<Buffer ...>".
    fs.readFile(dataFile, 'utf8', function (err, data) {
        if (err) throw err;
        console.log(data);
        console.log('-------')
    });
}
See the documentation here: http://nodejs.org/api/fs.html#fs_fs_readfile_filename_options_callback
I ended up solving the issue by stating the file on change, and reading the difference in size to the stream data:
'use strict';

var fs = require('fs'),
    dataFile = __dirname + '/server/data/test.csv',
    readSize = 0, // byte offset already consumed from the file
    csv = require('csv');

var parser = csv.parse();
parser.on('readable', function(data){
    var record;
    while(record = parser.read()){
        console.log(record);
    }
});

var watcher = fs.watch(dataFile);
watcher.on('change', fileChange);

// fires when the watched file changes
function fileChange(e, fn){
    // get these syncronously
    var stats = fs.statSync(dataFile);
    // if it's smaller, wait half a second and retry. The original lacked the
    // return, so it scheduled a retry AND still read a bogus range now.
    if (stats.size <= readSize) {
        setTimeout(fileChange, 500);
        return;
    }
    // stream only the bytes appended since the previous change
    var stream = fs.createReadStream(dataFile, {start: readSize, end: stats.size});
    stream.on('data', function(chunk){
        parser.write(chunk.toString());
    });
    readSize = stats.size;
}
Any feedback on why this may not work would be appreciated.
I need to stream a file in base64 to an http endpoint using something like request or superagent. What is the best way to figure out what percentage of the file has been uploaded?
I assume I can create the read stream using something like:
fs.createReadStream('/tmp/cats.jpg', {encoding: 'base64'})
Any examples using one out of above libraries would be greatly appreciated.
I think you can use progress-stream.
Here is an example from the package:
var progress = require('progress-stream');
var fs = require('fs');

var stat = fs.statSync(filename);

// Progress meter: it is told the total length up front and emits a
// 'progress' snapshot at most once every 100 ms while data flows through.
var str = progress({
    length: stat.size,
    time: 100 /* ms */
});

str.on('progress', function(progress) {
    console.log(progress);
    /*
    Sample snapshot:
    {
        percentage: 9.05,
        transferred: 949624,
        length: 10485760,
        remaining: 9536136,
        eta: 42,
        runtime: 3,
        delta: 295396,
        speed: 949624
    }
    */
});

// Plumb the meter between the source and the destination.
fs.createReadStream(filename)
    .pipe(str)
    .pipe(fs.createWriteStream(output));
I was looking for an answer to a similar issue and thanks to Alberto Zaccagni's answer, I was able to get some code working.
So for the people who don't want to piece the puzzle themselves, here is the code (edited for Stackoverflow):
var zipfile = "my_large_archive.zip";

// Stat the archive up front so the 'data' handler can turn a running byte
// count into a percentage.
fs.stat(zipfile, function (err, stats) {
    var formData = require('form-data');
    var zipSize = stats.size;
    var uploadedSize = 0; // bytes seen so far by the 'data' handler

    // Dedicated read stream so we can hang progress events off it before
    // handing it to form-data.
    var zipReadStream = fs.createReadStream(zipfile);
    zipReadStream.on('data', function(buffer) {
        // NOTE(review): this measures how much of the file has been READ;
        // it tracks the network upload only approximately.
        uploadedSize += buffer.length;
        console.log("Progress:\t",((uploadedSize/zipSize*100).toFixed(2)+"%"));
    });

    // Optional lifecycle hooks.
    zipReadStream.on('end', function() {
        console.log("Event: end");
    });
    zipReadStream.on('close', function() {
        console.log("Event: close");
    });

    var form = new formData();
    form.append('apikey', 'f4sd5f4sdf6ds456'); // Just some post parameters I need to send to the upload endpoint
    form.append('file', zipReadStream); // The zip file, passed as a fs.createReadStream instance

    // Submit the form and the file
    form.submit('http://www.someserver.com/upload', function(err, res) {
        if (err) {
            console.log("Oups! We encountered an error :(\n\n", err);
            return false;
        }
        console.log("Your file has been uploaded.");
        res.resume(); // drain the response so a CLI process can exit cleanly
    });
});
In Node.js we have the Readable stream; it emits the 'data' event when it receives a chunk of data. Knowing the file size, you can easily keep track of how much data passes through the 'data' event receiver and then update the percentage.
Get the file size with
// Bind fs to a name: the original called require('fs').watchFile(...) inline
// and then referenced fs.stat inside the callback, where `fs` was undefined.
var fs = require('fs');
fs.watchFile('yourfile', function () {
    fs.stat('yourfile', function (err, stats) {
        if (err) throw err;
        console.log(stats.size);
    });
});