I have to read a very large CSV file (> 80 MB and growing).
I usually only have to parse the last 1% of the file, but getting to that part takes a few minutes.
Is there a way to start reading only at line N?
Or, alternatively, could I read the stream from end to start?
I'm currently using fast-csv to read the file:
// convert csv into postgres copy file
csv.fromPath(filepath, {
  headers: false
}).transform(function(data) {
  // check if record meets condition
  var dt = parseInt(data[0]);
  var date = new Date(dt * 1000);
  var mom = moment(date);
  if (mom.isAfter('2014-01-01 00:00')) {
    // transform data and return object
    return transform(data);
  }
  return null;
}).pipe(csv.createWriteStream({
  headers: true
})).pipe(fs.createWriteStream(outpath, {
  encoding: "utf8"
})).on('finish', function() {
  // do postgres import
});
Using a combination of Node's fs.stat, fs.open, and fs.read, you could find the size of the file and read just the last 1% into a buffer:
var fs = require('fs');
var filename = 'csv.csv';

fs.stat(filename, function(err, stat) {
  if (err) throw err;
  var bytesToRead = Math.ceil(0.01 * stat.size); // last 1%
  var startingPosition = stat.size - bytesToRead;
  var readBuffer = new Buffer(bytesToRead);
  fs.open(filename, 'r', function(err, fd) {
    if (err) throw err;
    fs.read(fd, readBuffer, 0, bytesToRead, startingPosition,
      function(err, bytesRead) {
        if (err) throw err;
        console.log(readBuffer.toString());
      });
  });
});
You can't start reading directly at line N, because you would have to read the whole file anyway to know where the newline characters are.
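Building on that, here is a minimal, untested sketch of how you could combine the two ideas: seek to the last 1%, then discard everything up to the first newline so that parsing starts on a complete record. The Buffer.alloc call and the way the trimmed text is handed off afterwards are my own choices, not part of the original answer:

var fs = require('fs');
var filename = 'csv.csv';

fs.stat(filename, function(err, stat) {
  if (err) throw err;
  var bytesToRead = Math.ceil(0.01 * stat.size); // last 1%
  var startingPosition = stat.size - bytesToRead;
  var readBuffer = Buffer.alloc(bytesToRead); // non-deprecated alternative to new Buffer()
  fs.open(filename, 'r', function(err, fd) {
    if (err) throw err;
    fs.read(fd, readBuffer, 0, bytesToRead, startingPosition, function(err, bytesRead) {
      if (err) throw err;
      var text = readBuffer.toString('utf8', 0, bytesRead);
      // the first "line" almost certainly starts mid-record, so drop it
      var completeLines = text.slice(text.indexOf('\n') + 1);
      // hand completeLines to your CSV parser from here
      console.log(completeLines.split('\n').length + ' complete lines read');
    });
  });
});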
I am trying to read a text file that is continuously growing (new lines appended at the end) at a very high rate, let's say ~100 lines per second, where each line is approximately 200 characters.
I tried the following, which works but lags by a minute or so.
var fs = require('fs');
var path = "D:\\testreadwrite.txt";

fs.watchFile(path, function() {
  console.log('File Changed ...');
  var file = fs.readFileSync(path);
  console.log('File content at : ' + new Date() + ' is \n' + file);
});
I don't really need synchronous reading, but a lag of more than a minute is too high, nor do I need the entire file each time. All I need is to read the data and process it line by line, for each new line as it arrives. So I tried the code below, where I plan to loop through and pass an offset for each iteration, but it is not working for reasons I can't figure out. Please help.
var fs = require('fs');
var path = "D:\\Work\\Jai Ho\\myapp\\public\\testreadwrite.txt";

fs.watch(path, function(event, filename) {
  if (filename) {
    fs.stat(path, function(error, stats) {
      fs.open(path, "r", function(error, fd) {
        var buffer = new Buffer(stats.size);
        fs.read(fd, buffer, 0, buffer.length, null, function(error, bytesRead, buffer) {
          var data = buffer.toString("utf8");
          console.log(data);
        });
      });
    });
  } else {
    console.log('filename not provided');
  }
});
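For what it's worth, here is a minimal sketch of the offset-tracking idea described above, assuming the file is append-only: remember the size you have already consumed and, on each change event, read only the bytes added since then. The variable names and the lack of any guard against overlapping watch events are my own simplifications:

var fs = require('fs');
var path = "D:\\testreadwrite.txt";

// remember how much of the file has already been consumed
var lastSize = fs.existsSync(path) ? fs.statSync(path).size : 0;

fs.watch(path, function(event, filename) {
  fs.stat(path, function(err, stats) {
    if (err || stats.size <= lastSize) return; // nothing new (or file was truncated)
    var bytesToRead = stats.size - lastSize;
    var buffer = Buffer.alloc(bytesToRead);
    fs.open(path, 'r', function(err, fd) {
      if (err) return console.error(err);
      fs.read(fd, buffer, 0, bytesToRead, lastSize, function(err, bytesRead) {
        if (!err) {
          lastSize += bytesRead;
          // process only the newly appended lines
          buffer.toString('utf8', 0, bytesRead).split('\n').forEach(function(line) {
            if (line) console.log('new line: ' + line);
          });
        }
        fs.close(fd, function() {});
      });
    });
  });
});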
So I'm creating a class and ultimately want a method that takes a file on an SFTP server and produces a read stream that can be piped into other streams/functions. I'm most of the way there, except my read stream is acting strangely. Here's the relevant code:
const Client = require('ssh2').Client,
  Readable = require('stream').Readable,
  async = require('async');

/**
 * Class Definition stuff
 * ......
 */

getStream(get) {
  const self = this;
  const rs = new Readable;
  rs._read = function() {
    const read = this;
    self.conn.on('ready', function() {
      self.conn.sftp(function(err, sftp) {
        if (err) return err;
        sftp.open(get, 'r', function(err, fd) {
          sftp.fstat(fd, function(err, stats) {
            let bufferSize = stats.size,
              chunkSize = 512, // bytes
              buffer = new Buffer(bufferSize),
              bytesRead = 0;
            async.whilst(
              function() {
                return bytesRead < bufferSize;
              },
              function(done) {
                sftp.read(fd, buffer, bytesRead, chunkSize, bytesRead,
                  function(err, bytes, buff) {
                    if (err) return done(err);
                    // console.log(buff.toString('utf8'));
                    read.push(buff);
                    bytesRead += bytes;
                    done();
                  });
              },
              function(err) {
                if (err) console.log(err);
                read.push(null);
                sftp.close(fd);
              }
            );
          });
        });
      });
    }).connect(self.connectionObj);
  };
  return rs;
}
Elsewhere, I would call this method like so:
let sftp = new SFTP(credentials);

sftp.getStream('/path/file.csv')
  .pipe(toStuff)
  .pipe(toOutput);
So, long story short: during the sftp.read operation, read.push(buff) keeps pushing the same first part of the file over and over. However, when I console.log(buff), the full file comes through correctly.
I'm scratching my head wondering what I'm doing wrong with the read stream such that it only pushes the beginning of the file and never moves on to the next part of the buffer.
Here are the docs for the SSH2 SFTP client: https://github.com/mscdex/ssh2-streams/blob/master/SFTPStream.md
I used this SO question as inspiration for what I wrote above: node.js fs.read() example
This is similar/related: Reading file from SFTP server using Node.js and SSH2
OK, after a lot of trouble, I realized I was making a couple of mistakes. First, the _read function is called every time the stream is ready to read more data, which means the SFTP connection was being started every time _read was called. That also meant sftp.read() was starting over each time, resetting the starting point back to the beginning.
I needed a way to first set up the connection, then read and stream the file data, so I chose the noms library. Here's the final code, if anyone is interested:
getStream(get) {
  const self = this;
  let connection,
    fileData,
    buffer,
    totalBytes = 0,
    bytesRead = 0;

  // nom comes from the noms library mentioned above (e.g. const nom = require('noms'))
  return nom(
    // _read function
    function(size, next) {
      const read = this;
      // Check if we're done reading
      if (bytesRead === totalBytes) {
        connection.close(fileData);
        connection.end();
        self.conn.end();
        console.log('done');
        return read.push(null);
      }
      // Make sure we read the last bit of the file
      if ((bytesRead + size) > totalBytes) {
        size = (totalBytes - bytesRead);
      }
      // Read each chunk of the file
      connection.read(fileData, buffer, bytesRead, size, bytesRead,
        function(err, byteCount, buff, pos) {
          // console.log(buff.toString('utf8'));
          // console.log('reading');
          bytesRead += byteCount;
          read.push(buff);
          next();
        }
      );
    },
    // Before Function
    function(start) {
      // set up the connection BEFORE we start _read
      self.conn.on('ready', function() {
        self.conn.sftp(function(err, sftp) {
          if (err) return err;
          sftp.open(get, 'r', function(err, fd) {
            sftp.fstat(fd, function(err, stats) {
              connection = sftp;
              fileData = fd;
              totalBytes = stats.size;
              buffer = new Buffer(totalBytes);
              console.log('made connection');
              start();
            });
          });
        });
      }).connect(self.connectionObj);
    }
  );
}
I'm always looking for feedback. This doesn't run quite as fast as I'd hoped, so let me know if you have ideas on speeding up the stream.
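If manual control over each chunk isn't actually needed, the SFTPStream documentation linked above also describes a createReadStream(path) method. Assuming your ssh2/ssh2-streams version exposes it, a rough sketch of the same getStream method built on it looks like this (the PassThrough is my choice, so the method can return a stream synchronously before the connection is ready):

const { PassThrough } = require('stream');

getStream(get) {
  const self = this;
  const pass = new PassThrough();
  self.conn.on('ready', function() {
    self.conn.sftp(function(err, sftp) {
      if (err) return pass.emit('error', err);
      // createReadStream is documented in the SFTPStream docs linked above
      sftp.createReadStream(get).pipe(pass);
    });
  }).connect(self.connectionObj);
  return pass;
}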
Any idea why fs.read does not behave like fs.readSync here?
My code is very simple: it just reads the song file out chunk by chunk. With the fs.readSync version the file is read out 512 bytes at a time as expected, but with fs.read no log output is printed at all, and if I delete the while (readPostion < fileSize) loop it executes only once.
var chunkSize = 512; // the chunk size that will be read every time
var readPostion = 0; // the first byte which will be read from the file
var fileSize = 0;
var fs = require('fs');
//var Buffer = require("buffer");
//var songsBuf = Buffer.alloc(512);
var songsBuf = new Buffer(chunkSize);

fs.open('/media/sdcard/song.mp3', 'r', function(err, fd) {
  if (err)
    throw err;
  console.log("The file had been opened");
  var fileSize = fs.fstatSync(fd).size;
  console.log("The total size of this file is:%d Bytes", fileSize);
  console.log("Start to read the file chunk by chunk");

  // read the file in sync mode
  while (readPostion < fileSize) {
    fs.readSync(fd, songsBuf, 0, chunkSize, readPostion);
    if (readPostion + chunkSize > fileSize)
      chunkSize = fileSize - readPostion;
    readPostion += chunkSize;
    console.log("the read position is %d", readPostion);
    console.log("The chunk size is %d", chunkSize);
    console.log(songsBuf);
  }

  // the code above can readout the file chunk by chunk but the below one cannot
  // read the file in Async mode
  while (readPostion < fileSize) {
    // console.log("ff");
    fs.read(fd, songsBuf, 0, chunkSize, 1, function(err, byteNum, buffer) {
      if (err)
        throw err;
      console.log("Start to read from %d byte", readPostion);
      console.log("Total bytes are %d", byteNum);
      console.log(buffer);
      if (readPostion + chunkSize > fileSize)
        chunkSize = fileSize - readPostion; // if the files to read is smaller than one chunk
      readPostion += chunkSize;
    });
  }
  fs.close(fd);
});
You can do this with the async library.
const async = require('async');

// replaces the async while loop inside the fs.open callback above
async.whilst(
  function () { return readPostion < fileSize; },
  function (callback) {
    // read the next chunk at the current position
    // (the original passed 1 here, which read from the same offset every time)
    fs.read(fd, songsBuf, 0, chunkSize, readPostion, function (err, byteNum, buffer) {
      if (err) return callback(err);
      console.log("Start to read from %d byte", readPostion);
      console.log("Total bytes are %d", byteNum);
      console.log(buffer);
      if (readPostion + chunkSize > fileSize)
        chunkSize = fileSize - readPostion; // if the remaining file is smaller than one chunk
      readPostion += chunkSize;
      callback(null, songsBuf);
    });
  },
  function (err, n) {
    if (err) console.error(err);
    fs.close(fd, function () {});
    // Do something with songsBuf here
  }
);
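On newer Node versions, a promise-based loop with fs.promises avoids both the callback-ordering problem and the extra dependency. This is just a sketch of the same chunk-by-chunk read, reusing the song path and 512-byte chunk size from the question:

const { open } = require('fs').promises;

async function readChunks(path, chunkSize) {
  const fileHandle = await open(path, 'r');
  try {
    const { size } = await fileHandle.stat();
    const buf = Buffer.alloc(chunkSize);
    let position = 0;
    while (position < size) {
      const length = Math.min(chunkSize, size - position);
      const { bytesRead } = await fileHandle.read(buf, 0, length, position);
      console.log('read %d bytes at position %d', bytesRead, position);
      console.log(buf.slice(0, bytesRead));
      position += bytesRead;
    }
  } finally {
    await fileHandle.close();
  }
}

readChunks('/media/sdcard/song.mp3', 512).catch(console.error);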
I have the following text file ("test.txt") that I want to manipulate in node.js:
world
food
I want to remove the first line so that food becomes the first line instead. How can I do that?
var fs = require('fs');

fs.readFile(filename, 'utf8', function(err, data) {
  if (err) {
    // check and handle err
  }
  // data is the file contents as a single unified string
  // .split('\n') splits it at each newline character, giving an array of lines
  // .slice(1) returns a new array containing everything except the first element (i.e. the first line)
  // .join('\n') re-concatenates that array back into a string
  var linesExceptFirst = data.split('\n').slice(1).join('\n');
  fs.writeFile(filename, linesExceptFirst, function(err) {
    // check and handle err
  });
});
I just came across the need to exclude several lines from a file. Here's how I did it with a simple node function.
const fs = require('fs');

const removeLines = (data, lines = []) => {
  return data
    .split('\n')
    .filter((val, idx) => lines.indexOf(idx) === -1)
    .join('\n');
}

fs.readFile(fileName, 'utf8', (err, data) => {
  if (err) throw err;

  // remove the first line and the 5th and 6th lines in the file
  fs.writeFile(fileName, removeLines(data, [0, 4, 5]), 'utf8', function(err) {
    if (err) throw err;
    console.log("the lines have been removed.");
  });
})
Use replace:
const fs = require('fs');

function readWriteSync() {
  var data = fs.readFileSync(filepath, 'utf-8');
  // replace 'world' together with the new line character with empty
  var newValue = data.replace(/world\n/, '');
  fs.writeFileSync(filepath, newValue, 'utf-8');
}
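If the file is too large to read into memory comfortably (as in the other questions on this page), the same trick can be done with streams: read line by line and skip line 0 while writing to a temporary file. A sketch, where the temp-file name and the removeFirstLineStreaming helper are my own invention (note that line endings come out normalized to \n):

const fs = require('fs');
const readline = require('readline');

function removeFirstLineStreaming(inPath, outPath, done) {
  const rl = readline.createInterface({
    input: fs.createReadStream(inPath),
    crlfDelay: Infinity
  });
  const out = fs.createWriteStream(outPath);
  let lineNumber = 0;
  rl.on('line', (line) => {
    if (lineNumber++ > 0) out.write(line + '\n'); // skip the first line
  });
  rl.on('close', () => out.end(done));
}

// usage: write the result to a temp file, then rename it over the original
removeFirstLineStreaming('test.txt', 'test.txt.tmp', () => {
  fs.rename('test.txt.tmp', 'test.txt', (err) => {
    if (err) throw err;
  });
});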
Is there a way to read one symbol (character) at a time from a file in Node.js, without storing the whole file in memory?
I found an answer for reading line by line.
I tried something like this, but it doesn't help:
const stream = fs.createReadStream("walmart.dump", {
  encoding: 'utf8',
  fd: null,
  bufferSize: 1,
});

stream.on('data', function(sym) {
  console.log(sym);
});
A Readable stream has a read() method to which you can pass the length, in bytes, of each chunk to be read. For example:
var readable = fs.createReadStream("walmart.dump", {
  encoding: 'utf8',
  fd: null,
});

readable.on('readable', function() {
  var chunk;
  while (null !== (chunk = readable.read(1) /* here */)) {
    console.log(chunk); // chunk is one byte
  }
});
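As an aside, the bufferSize option used in the question isn't one that fs.createReadStream recognizes; the size of its internal buffer is controlled by highWaterMark. With it set to 1 the 'data' events should also arrive a byte at a time, though readable.read(1) as above gives you more explicit control. A sketch (assuming single-byte characters):

const fs = require('fs');

const stream = fs.createReadStream("walmart.dump", {
  encoding: 'utf8',
  highWaterMark: 1 // read 1 byte per underlying read (bufferSize is not a real option)
});

stream.on('data', function(sym) {
  console.log(sym);
});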
Here's a lower-level way to do it, using fs.read(fd, buffer, offset, length, position, callback):
const fs = require('fs');

// open file for reading, returns file descriptor
const fd = fs.openSync('your-file.txt', 'r');

function readOneCharFromFile(position, cb) {
  // only need to store one byte (one character)
  const b = new Buffer(1);
  fs.read(fd, b, 0, 1, position, function(err, bytesRead, buffer) {
    console.log('data => ', String(buffer));
    cb(err, buffer);
  });
}
You will have to increment the position as you read the file, but it will work.
Here's a quick example of how to read a whole file, character by character. Just for fun, I wrote this complete script to do it; pass in a different file path and it should work.
const async = require('async');
const fs = require('fs');
const path = require('path');

function read(fd, position, cb) {
  let isByteRead = null;
  let ret = new Buffer(0);

  async.whilst(
    function () {
      return isByteRead !== false;
    },
    function (cb) {
      readOneCharFromFile(fd, position++, function (err, bytesRead, buffer) {
        if (err) {
          return cb(err);
        }
        isByteRead = !!bytesRead;
        if (isByteRead) {
          ret = Buffer.concat([ret, buffer]);
        }
        cb(null);
      });
    },
    function (err) {
      cb(err, ret);
    }
  );
}

function readOneCharFromFile(fd, position, cb) {
  // only need to store one byte (one character)
  const b = new Buffer(1);
  fs.read(fd, b, 0, 1, position, cb);
}

/// use your own file here
const file = path.resolve(__dirname + '/fixtures/abc.txt');
const fd = fs.openSync(file, 'r');

// start reading at position 0, position will be incremented
read(fd, 0, function (err, data) {
  if (err) {
    console.error(err.stack || err);
  } else {
    console.log('data => ', String(data));
  }
  fs.closeSync(fd);
});
As you can see, we increment the position integer every time we read the file; hopefully the OS keeps the file cached as we go. Using async.whilst() is OK, but I think for a more functional style it's better not to keep the state at the top of the function (ret and isByteRead). I'll leave it as an exercise to the reader to implement this without those stateful variables.
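For the curious, one possible shape of that exercise, a sketch of my own rather than part of the original answer, is to recurse and pass the accumulated buffer along instead of mutating outer variables:

const fs = require('fs');

function readAllCharsFromFile(fd, position, acc, cb) {
  const b = Buffer.alloc(1);
  fs.read(fd, b, 0, 1, position, (err, bytesRead) => {
    if (err) return cb(err);
    if (bytesRead === 0) return cb(null, acc); // reached end of file
    // recurse with the accumulated buffer instead of mutating shared state
    readAllCharsFromFile(fd, position + 1, Buffer.concat([acc, b]), cb);
  });
}

// use your own file here
const fd = fs.openSync('./fixtures/abc.txt', 'r');

readAllCharsFromFile(fd, 0, Buffer.alloc(0), (err, data) => {
  if (err) console.error(err.stack || err);
  else console.log('data => ', String(data));
  fs.closeSync(fd);
});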